This page collects typical usage examples of the Python method utils.Regex.Regex.replaceData. If you are looking for concrete examples of how Regex.replaceData is called in real code, the curated samples below may help. You can also explore further usage examples of the containing class, utils.Regex.Regex.
Shown below are 15 code examples of the Regex.replaceData method, sorted by popularity by default.
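Every example calls Regex.replaceData(pattern, replacement, data), usually together with helper methods such as reduceNewLine, reduceBlankSpace, getSearchedData, getAllSearchedData and isFoundPattern. The utils.Regex module itself is not reproduced on this page, so the following is only a minimal sketch of what such a class might look like, assuming each method is a thin wrapper around Python's built-in re module; the method bodies are guesses inferred from how the examples use them, not the original implementation.
# -*- coding: utf-8 -*-
# Hypothetical sketch of utils.Regex.Regex; not the original source.
import re

class Regex(object):
    def replaceData(self, pattern, replacement, data):
        # Assumed behaviour: substitute every occurrence of pattern in data.
        return re.sub(pattern, replacement, data)

    def getSearchedData(self, pattern, data):
        # Assumed behaviour: return the first captured group, or '' when nothing matches.
        match = re.search(pattern, data)
        return match.group(1) if match else ''

    def getAllSearchedData(self, pattern, data):
        # Assumed behaviour: return every match (tuples when the pattern has several groups).
        return re.findall(pattern, data)

    def isFoundPattern(self, pattern, data):
        # Assumed behaviour: True when the pattern occurs anywhere in data.
        return re.search(pattern, data) is not None

    def reduceNewLine(self, data):
        # Assumed behaviour: collapse runs of newlines into a single space.
        return re.sub(r'\n+', ' ', data)

    def reduceBlankSpace(self, data):
        # Assumed behaviour: collapse runs of whitespace into a single space.
        return re.sub(r'\s+', ' ', data)
With a class like this, a call such as regex.replaceData('\r+', '', line) simply strips carriage returns from line, which is the most common way the method is used in the examples below.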
Example 1: SaraivaScrapper
# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]
class SaraivaScrapper(QThread):
notifySaraiva = pyqtSignal(object)
def __init__(self, urlList, category, htmlTag, replaceTag):
QThread.__init__(self)
self.logger = LogManager(__name__)
self.spider = Spider()
self.regex = Regex()
self.utils = Utils()
self.urlList = urlList
self.category = category
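# Clean up the raw htmlTag with replaceData: strip carriage returns, turn newlines into spaces, collapse runs of whitespace, and reduce repeated double quotes to a single one.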
self.htmlTag = self.regex.replaceData('\r+', '', htmlTag)
self.htmlTag = self.regex.replaceData('\n+', ' ', self.htmlTag)
self.htmlTag = self.regex.replaceData('\s+', ' ', self.htmlTag)
self.htmlTag = self.regex.replaceData(r'\"+', '\"', self.htmlTag)
self.replaceTag = replaceTag
self.csvWriter = Csv(category + '.csv')
csvDataHeader = ['Link', 'Name', 'Subtitle', 'Price', 'Synopsis and Characteristics', 'Picture']
self.csvWriter.writeCsvRow(csvDataHeader)
self.mainUrl = 'http://busca.livrariasaraiva.com.br'
self.scrapUrl = None
self.dbHelper = DbHelper('saraiva.db')
self.dbHelper.createTable(category)
self.total = self.dbHelper.getTotalProduct(category)
def run(self, retry=0):
try:
if self.urlList is not None and len(self.urlList):
for url in self.urlList:
if len(url) > 0:
url = self.regex.replaceData('(?i)\r', '', url)
url = self.regex.replaceData('(?i)\n', '', url)
self.notifySaraiva.emit('<font color=green><b>Saraiva Main URL: %s</b></font>' % url)
paginationUrl, self.maxRecords = self.reformatUrl(url)
self.notifySaraiva.emit(
'<font color=black><b>Total Records: %s</b></font>' % str(self.maxRecords))
print 'Max records: ', self.maxRecords
print 'URL: ' + str(paginationUrl)
sortList = ['&isort=globalpop', '&isort=best', '&isort=title', '&isort=title+rev',
'&isort=price+rev',
'&isort=price', '&isort=date+rev']
for sort in sortList:
self.scrapResults(paginationUrl, sort)
self.notifySaraiva.emit('<font color=red><b>Saraiva Data Scraping finished.</b></font>')
except Exception, x:
print x.message
self.logger.error('Exception at run: ', x.message)
if retry < 5:
self.run(retry + 1)
Example 2: __init__
# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]
class GoogleFinanceScrapper:
isFinished = False
def __init__(self, filename):
self.logger = LogManager(__name__)
self.spider = Spider()
self.regex = Regex()
self.utils = Utils()
self.filename = filename
self.url = 'https://www.google.com/finance?'
self.main_url = 'https://www.google.com'
self.csvWriter = Csv('google_finance.csv')
csvDataHeader = ['Ticker Symbol', 'Quarter End', 'Revenue', 'Total Revenue', 'Date of Scrape']
self.csvWriter.writeCsvRow(csvDataHeader)
def run(self):
self.scrapData()
self.csvWriter.closeWriter()
def scrapData(self):
try:
file = open(self.filename, 'rb')
for line in file.readlines():
if self.isFinished: return
line = self.regex.replaceData('\r+', '', line)
line = self.regex.reduceNewLine(line)
line = self.regex.reduceBlankSpace(line)
line = line.strip()
params = urllib.urlencode({'q': line})
url = self.url + params
self.scrapBykeyword(url, line)
except Exception, x:
print x
self.logger.error('Error: ' + x.message)
Example 3: __init__
# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]
class TopsyScrapper:
isFinished = False
def __init__(self, filename):
self.logger = LogManager(__name__)
self.spider = Spider()
self.regex = Regex()
self.utils = Utils()
self.filename = filename
self.url = 'http://topsy.com/s?'
self.csvWriter = Csv('topsy.csv')
csvDataHeader = ['Keyword', 'Tweets in last 30 days', 'Topsy Sentiment Score', ' Date of scrape']
self.csvWriter.writeCsvRow(csvDataHeader)
def run(self):
self.scrapData()
self.csvWriter.closeWriter()
def scrapData(self):
try:
file = open(self.filename, 'rb')
for line in file.readlines():
if self.isFinished: return
line = self.regex.replaceData('\r+', '', line)
line = self.regex.reduceNewLine(line)
line = self.regex.reduceBlankSpace(line)
line = line.strip()
if len(line) > 0:
params = urllib.urlencode({'q': line, 'window': 'm', 'type': 'tweet'})
url = self.url + params
self.scrapBrowserData(url, line)
except Exception, x:
print x
Example 4: AmazonScrapper
# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]
class AmazonScrapper(QThread):
notifyAmazon = pyqtSignal(object)
def __init__(self, urlList, category):
QThread.__init__(self)
self.logger = LogManager(__name__)
self.spider = Spider()
self.regex = Regex()
self.utils = Utils()
self.urlList = urlList
self.category = category
dupCsvReader = Csv()
self.dupCsvRows = dupCsvReader.readCsvRow(category + '.csv')
self.csvWriter = Csv(category + '.csv')
csvDataHeader = ['SKU', 'Title', 'Sub Title', 'Price', 'Shipping Weight', 'Image URL']
if csvDataHeader not in self.dupCsvRows:
self.dupCsvRows.append(csvDataHeader)
self.csvWriter.writeCsvRow(csvDataHeader)
self.mainUrl = 'http://www.amazon.com'
self.scrapUrl = None
self.dbHelper = DbHelper('amazon.db')
self.dbHelper.createTable(category)
self.total = self.dbHelper.getTotalProduct(category)
def run(self, retry=0):
try:
# self.scrapProductDetail(
# 'http://www.amazon.com/Casio-MRW-S300H-8BVCF-Solar-Powered-Analog/dp/B00ELALKH2/ref=sr_1_544/184-7248556-2619812?s=watches&ie=UTF8&qid=1397580509&sr=1-544')
# return
if self.urlList is not None and len(self.urlList):
for url in self.urlList:
if len(url) > 0:
url = self.regex.replaceData('(?i)\r', '', url)
url = self.regex.replaceData('(?i)\n', '', url)
self.notifyAmazon.emit('<font color=green><b>Amazon Main URL: %s</b></font>' % url)
imUrl = None
retry = 0
while imUrl is None and retry < 4:
imUrl = self.reformatUrl(url)
retry += 1
if imUrl is None:
imUrl = url
self.total = 0
print 'URL: ' + str(imUrl)
sortList = ['relevance-fs-browse-rank', 'price', '-price', 'reviewrank_authority',
'date-desc-rank']
for sort in sortList:
self.scrapReformatData(imUrl, sort)
self.notifyAmazon.emit(
'<font color=red><b>Finish data for Amazon Main URL: %s</b></font><br /><br />' % url)
self.notifyAmazon.emit('<font color=red><b>Amazon Data Scraping finished.</b></font>')
except Exception, x:
print x.message
self.logger.error('Exception at run: ', x.message)
if retry < 5:
self.run(retry + 1)
Example 5: PaodeacucarScrapper
# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]
class PaodeacucarScrapper(QThread):
notifyPaode = pyqtSignal(object)
def __init__(self):
QThread.__init__(self)
self.logger = LogManager(__name__)
self.spider = Spider()
self.regex = Regex()
self.utils = Utils()
self.mainUrl = 'http://www.paodeacucar.com.br/'
self.url = 'http://www.paodeacucar.com.br/'
dupCsvReader = Csv()
self.dupCsvRows = dupCsvReader.readCsvRow('paodeacucar.csv', 4)
self.csvWriter = Csv('paodeacucar.csv')
csvDataHeader = ['SKU', 'Category', 'Subcategory', 'Name', 'URL', 'URL Image', 'Details',
'Nutrients Table html code', 'Price from, 28/abr/14', '28/abr/14']
if 'URL' not in self.dupCsvRows:
self.dupCsvRows.append(csvDataHeader)
self.csvWriter.writeCsvRow(csvDataHeader)
def run(self):
self.scrapData()
def scrapData(self):
try:
print 'Main URL: ', self.url
self.notifyPaode.emit(('<font color=green><b>Main URL: %s</b></font>' % self.url))
data = self.spider.fetchData(self.url)
if data and len(data) > 0:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
soup = BeautifulSoup(data)
categories = soup.find('nav', class_='items-wrapper').find_all('li', class_=re.compile('\s*item\s*'))
print 'Total Categories: ', len(categories)
self.notifyPaode.emit(('<font color=black><b>Total Categories: %s</b></font>' % str(len(categories))))
for category in categories:
if category.a is not None:
submenu_target = self.regex.replaceData('#', '', category.a.get('data-target'))
sub_categories = soup.find('ul', id=submenu_target).find_all('li', class_='item')
print 'Total Sub Categories: ', len(sub_categories)
self.notifyPaode.emit(('<font color=black><b>Total Subcategories: %s</b></font>' % str(len(sub_categories))))
for sub_category in sub_categories:
sub_category_label = sub_category.find('span', class_='label').text
sub_category_url = sub_category.a.get('href') if sub_category.a is not None else 'N/A'
self.scrapItems(sub_category_url, category.text, sub_category_label)
except Exception, x:
self.logger.error(x.message)
print x
Example 6: AmazonScrapper
# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]
class AmazonScrapper():
def __init__(self, url):
self.logger = LogManager(__name__)
self.spider = Spider()
self.regex = Regex()
self.utils = Utils()
self.url = url
self.base_product_url = 'http://www.amazon.com/dp/'
self.base_image_url = 'http://ecx.images-amazon.com/images/I/'
self.csvWriter = Csv('amazon.csv')
csvDataHeader = ['URL', 'HTML Path', 'Image URLS']
self.csvWriter.writeCsvRow(csvDataHeader)
def scrapData(self):
try:
host = ('Host', 'www.amazon.com')
data = self.spider.fetchData(self.url, host=host)
if data:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
searchParams = self.regex.getSearchedData('(?i)var searchParams = {([^\}]*)}', data)
searchParams = searchParams.split(',')
seller = ''
marketPlaceId = ''
useMYI = ''
for searchParam in searchParams:
searchParam = self.regex.reduceBlankSpace(searchParam)
searchParam = self.regex.replaceData('\'', '', searchParam)
if searchParam.startswith('seller'):
seller = searchParam.split(':')[1].strip()
seller = seller.decode('string-escape')
if searchParam.startswith('marketplaceID'):
marketPlaceId = searchParam.split(':')[1].strip()
marketPlaceId = marketPlaceId.decode('string-escape')
if searchParam.startswith('useMYI'):
useMYI = searchParam.split(':')[1].strip()
useMYI = useMYI.decode('string-escape')
params = {'seller': seller,
'marketPlaceId': marketPlaceId,
'useMYI': useMYI}
ajax_url = 'http://www.amazon.com/gp/aag/ajax/productWidget.html'
self.scrapAjaxPage(ajax_url, params, host)
except Exception, x:
print x
Example 7: Scrapper
# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]
class Scrapper(QThread):
notifyScrapper = pyqtSignal(object)
isFinished = False
def __init__(self, urllist):
QThread.__init__(self)
self.logger = LogManager(__name__)
self.spider = Spider()
self.regex = Regex()
self.utils = Utils()
print urllist
self.urllist = urllist
self.csv = Csv('scrapper.csv')
def run(self):
self.scrapData()
self.notifyScrapper.emit(
'<font color=green><b>------------------ Finish! ------------------------- </b></font>')
def scrapData(self):
try:
total = 0
csvHeader = ['URL', 'Title', 'Price', 'Brand', 'Features', 'Material', 'Measurements', 'Category',
'Size', 'Color', 'Design']
self.csv.writeCsvRow(csvHeader)
if self.isFinished: return
for url in self.urllist:
if len(url) > 0:
url = self.regex.replaceData('(?i)\r', '', url)
url = self.regex.replaceData('(?i)\n', '', url)
url = self.regex.getSearchedData('(?i)(http.*?)$', url)
print 'URL: ', url
self.notifyScrapper.emit(('<font color=green><b>URL: %s</b></font>' % url))
data = self.spider.fetchData(url)
if data and len(data) > 0:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
soup = BeautifulSoup(data)
soup.prettify()
title = ''
price = ''
size = ''
brand = ''
features = ''
material = ''
measurements = ''
category = ''
color = ''
design = ''
if soup.find('span', id='vi-lkhdr-itmTitl') is not None:
title = soup.find('span', id='vi-lkhdr-itmTitl').text
if soup.find('span', id='prcIsum'):
price = soup.find('span', id='prcIsum').text
if soup.find('div', class_='itemAttr'):
specchunk = soup.find('div', class_='itemAttr')
rows = specchunk.find_all('tr')
for row in rows:
cols = row.find_all('td')
for i in range(0, len(cols), 2):
# if self.regex.isFoundPattern('(?i)Condition:', cols[i].text.strip()):
# conditionChunk = cols[i + 1]
# conditionChunk = self.regex.replaceData(u'(?i)<span class="infoLink u-nowrap" id="readFull">.*?</span>', '', unicode(conditionChunk))
# conditionChunk = self.regex.replaceData(u'(?i)<b class="g-hdn">.*?</b>', '', conditionChunk)
# condition = BeautifulSoup(conditionChunk).text
# print condition
if self.regex.isFoundPattern('(?i)Brand:', cols[i].text.strip()):
brand = cols[i + 1].text
if self.regex.isFoundPattern('(?i)Features:', cols[i].text.strip()):
features = cols[i + 1].text
if self.regex.isFoundPattern('(?i)Material:', cols[i].text.strip()):
material = cols[i + 1].text
if self.regex.isFoundPattern('(?i)Measurements:', cols[i].text.strip()):
measurements = cols[i + 1].text
if self.regex.isFoundPattern('(?i)Category:', cols[i].text.strip()):
category = cols[i + 1].text
if self.regex.isFoundPattern('(?i)Color:', cols[i].text.strip()):
color = cols[i + 1].text
if self.regex.isFoundPattern('(?i)Design:', cols[i].text.strip()):
design = cols[i + 1].text
if self.regex.isFoundPattern('(?i)Size:', cols[i].text.strip()):
size = cols[i + 1].text
self.notifyScrapper.emit('<font color=black><b>Writing data to csv file.</b></font>')
csvData = [url, title, price, brand, features, material, measurements, category, size, color, design]
self.notifyScrapper.emit('<font color=black><b>Data: %s</b></font>' % unicode(csvData))
self.csv.writeCsvRow(csvData)
self.notifyScrapper.emit('<font color=black><b>Successfully wrote data to csv file.</b></font>')
total += 1
self.notifyScrapper.emit('<font color=green><b>Total Data scrapped: [%s]</b></font>' % str(total))
except Exception, x:
self.notifyScrapper.emit('<font color=red><b>Error scrapping category: %s</b></font>' % x.message)
self.logger.error(x.message)
print x
Example 8: CsCat
# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]
#......... some of the code is omitted here .........
self.dupCsvRows.append(csvData)
else:
self.notifyCategory.emit('<font color=green><b>Already Exists Category [%s] in csv file. Skip it.</b></font>' % categoryName)
subCategories = self.regex.getAllSearchedData(
'(?i)<li> <a href="([^"]*)" title="([^"]*)"[^>]*?>[^<]*?</a> </li>', data)
if subCategories and len(subCategories) > 0:
self.totalCategory += len(subCategories)
self.notifyCategory.emit(
'<font color=green><b>Total Category Found [%s]</b></font>' % unicode(self.totalCategory))
for subCategory in subCategories:
print subCategory
self.scrapSubCategory(subCategory[0], categoryName, subCategory[1])
def scrapSubCategory(self, url, rootCategoryName, categoryName):
self.notifyCategory.emit('<font color=green><b>Start scraping URL: %s</b></font>' % url)
data = self.spider.fetchData(url)
if data and len(data) > 0:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
self.filterCategory(data, categoryName)
categoryDesc = self.regex.getSearchedData('(?i)<div class="category-description std">([^<]*)</div>', data)
categoryDesc = unicode(categoryDesc).strip()
csvData = [rootCategoryName, categoryName, categoryDesc]
if csvData not in self.dupCsvRows:
self.csvWriter.writeCsvRow(csvData)
self.dupCsvRows.append(csvData)
self.notifyCategory.emit('<b>Scraped Data: %s</b>' % unicode(csvData))
else:
self.notifyCategory.emit('<font color=green><b>Already Exists Category [%s] in csv file. Skip it.</b></font>' % categoryName)
subCategories = self.regex.getAllSearchedData(
'(?i)<li> <a href="([^"]*)" title="([^"]*)"[^>]*?>[^<]*?</a> </li>', data)
if subCategories and len(subCategories) > 0:
self.totalCategory += len(subCategories)
self.notifyCategory.emit(
'<font color=green><b>Total Category Found [%s]</b></font>' % unicode(self.totalCategory))
for subCategory in subCategories:
self.scrapFinalCategory(subCategory[0], categoryName, subCategory[1])
def scrapFinalCategory(self, url, rootCategoryName, categoryName):
self.notifyCategory.emit('<font color=green><b>Start scraping URL: %s</b></font>' % url)
data = self.spider.fetchData(url)
if data and len(data) > 0:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
self.filterCategory(data, categoryName)
categoryDesc = self.regex.getSearchedData(u'(?i)<div class="category-description std">([^<]*)</div>', data)
if len(categoryDesc) > 0:
categoryDesc = categoryDesc.strip()
csvData = [rootCategoryName, categoryName, categoryDesc]
if csvData not in self.dupCsvRows:
self.csvWriter.writeCsvRow(csvData)
self.dupCsvRows.append(csvData)
self.notifyCategory.emit('<b>Scraped Data: %s</b>' % unicode(csvData))
else:
self.notifyCategory.emit('<font color=green><b>Already Exists Category [%s] in csv file. Skip it.</b></font>' % categoryName)
def filterCategory(self, data, categoryName):
# self.csvW = Csv(category + '.csv')
filterData = self.regex.getSearchedData('(?i)<h4>Filter your results</h4> <dl id="narrow-by-list">(.*?)</dl>',
data)
if filterData and len(filterData) > 0:
self.notifyCategory.emit('<b>Filter Data found writing to csv</b>')
allFilters = self.regex.getAllSearchedData('(?i)<dt>([^<]*)</dt> <dd>(.*?)</dd>', filterData)
topData = [categoryName]
childData = []
maxLen = 0
for allFilter in allFilters:
topData.append(allFilter[0])
print 'Filter: ' + allFilter[0]
filterName = self.regex.replaceData('(?i)<span class="price">', '', allFilter[1])
filterName = self.regex.replaceData('(?i)</span>', '', filterName)
filters = self.regex.getAllSearchedData('(?i)<a href=[^>]*>([^<]*)</a>', filterName)
if filters is not None and len(filters) > 0:
childData.append(filters)
if len(filters) > maxLen:
maxLen = len(filters)
if topData not in self.dupFilterCsvRows:
self.csvW.writeCsvRow(topData)
self.notifyCategory.emit(
'<font color=green><b>Filters Found For Category [%s].</b></font> <br /><b>Filters are: %s</b>' % (
unicode(categoryName), unicode(topData[1:])))
else:
self.notifyCategory.emit('<font color=green><b>Already scraped Filter For Category [%s]. Skip it.</b></font>' % categoryName)
return
for row in range(maxLen):
rowData = ['']
for columnData in childData:
if len(columnData) > row:
rowData.append(columnData[row])
else:
rowData.append('')
print rowData
self.csvW.writeCsvRow(rowData)
else:
self.notifyCategory.emit(
'<font color=green><b>No Filter Found For Category[%s].</b></font>' % categoryName)
Example 9: Regex
# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]
# -*- coding: utf-8 -*-
from utils.Regex import Regex
__author__ = "Rabbi-Tuly"
if __name__ == "__main__":
text = """<table style="width: 187px;" align="center" background=" border="0" cellpadding="0" cellspacing="0"> <tbody> <tr> <td colspan="2"><img src="http://img.photobucket.com/albums/v402/WGT-32/1.png" /> <a href="http://eshops.mercadolivre.com.br/quackstore/" target="_blank"><img src="http://img.photobucket.com/albums/v402/WGT-32/2.png" border="0" /></a> <img src="http://img.photobucket.com/albums/v402/WGT-32/3.png" /> <table style="width: 100%;" border="0" cellpadding="0" cellspacing="0"> <tbody> <tr> <td style="padding-left: 60px;" background="http://img.photobucket.com/albums/v402/WGT-32/fundo.png"> <p> </p> <p>Você está convidado a juntar-se a um grupo que durante uma semana vai estudar com um dos maiores especialistas em liderança dos Estados Unidos. Leonard Hoffman, um famoso empresário que abandonou sua brilhante carreira para se tornar monge em um mosteiro beneditino, é o personagem central desta envolvente história criada por James C. Hunter para ensinar de forma clara e agradável os princÃpios fundamentais dos verdadeiros lÃderes. Se você tem dificuldade em fazer com que sua equipe dê o melhor de si no trabalho e gostaria de se relacionar melhor com sua famÃlia e seus amigos, vai encontrar neste livro personagens, idéias e discussões que vão abrir um novo horizonte em sua forma de lidar com os outros. É impossÃvel ler este livro sem sair transformado. "O Monge e o Executivo" é, sobretudo, uma lição sobre como se tornar uma pessoa melhor.<br /> <br /> <b>I.S.B.N.: </b>8575421026<br /> <br /> <b>Cód. Barras: </b>9788575421024<br /> <br /> <b>Reduzido: </b>149181<br /> <br /> <b>Altura: </b>21 cm.<br /><b>Largura: </b>14 cm.<br /><b>Acabamento : </b>Brochura<br /><b>Edição : </b>1 / 2004<br /><b>Idioma : </b>Português<br /><b>PaÃs de Origem : </b>Brasil<br /><b>Número de Paginas : </b>144<br /> <br /></p> <p> </p> <p style="text-align: center;"></p> </td> </tr> </tbody> </table> <img src="http://img.photobucket.com/albums/v402/WGT-32/5.png" /><br /> <img src="http://img.photobucket.com/albums/v402/WGT-32/6.png" /><br /> <img src="http://img.photobucket.com/albums/v402/WGT-32/7.png" /><br /> <img src="http://img.photobucket.com/albums/v402/WGT-32/8_usa_pf.png" /><br /> <img src="http://img.photobucket.com/albums/v402/WGT-32/10.png" /><br /> <img src="http://img.photobucket.com/albums/v402/WGT-32/11.png" /></td> </tr> <tr> <td valign="top" width="186"></td> <td valign="top" width="1"></td> </tr> <tr> <td colspan="2"></td> </tr> </tbody> </table>"""
regex = Regex()
d = regex.replaceData(r"(?i)(?:<br\s*/>\s*)+", "<br />", text)
print d
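For a quicker check of the same replacement without the long HTML sample, a minimal call (assuming replaceData wraps re.sub as sketched at the top of this page) behaves like this:
regex = Regex()
print regex.replaceData(r"(?i)(?:<br\s*/>\s*)+", "<br />", "one<br /><br /> <br />two")
# prints: one<br />two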
Example 10: YoutubeScrapper
# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]
class YoutubeScrapper(object):
def __init__(self):
self.logger = LogManager(__name__)
self.spider = Spider()
self.regex = Regex()
self.utils = Utils()
def scrapVideoDownloadUrl(self, url, filename=None):
data = self.spider.fetchData(url)
if data and len(data) > 0:
title = self.scrapTitle(url)
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
dlUrlChunk = self.regex.getSearchedData('(?i)"url_encoded_fmt_stream_map": "([^"]*)"', data)
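# The stream map escapes '&' as the literal text \u0026; replacing it with a space below makes it possible to split each stream entry into its individual parameters.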
dlUrlChunk = self.regex.replaceData('(?i)\\\\u0026', ' ', dlUrlChunk)
dlUrlParts = dlUrlChunk.split(',')
sig = ''
video = ''
videoUrl = ''
print dlUrlParts
for dlUrlPart in dlUrlParts:
dlUrlPart = urllib2.unquote(dlUrlPart)
print dlUrlPart
## TODO
if self.regex.isFoundPattern('(?i)itag=22', dlUrlPart) or self.regex.isFoundPattern('(?i)itag=18',
dlUrlPart):
urlPart = dlUrlPart.split(' ')
for part in urlPart:
print part
if self.regex.isFoundPattern('(?i)sig=.*?', part):
sig = self.regex.getSearchedData('(?i)sig=(.*?)$', part)
if self.regex.isFoundPattern('(?i)url=.*?', part):
video = self.regex.getSearchedData('(?i)url=(.*?)$', part)
print video
videoUrl = video + '&signature=' + sig
self.downloadDir = './natok.mp4'
print 'Video URL= ' + videoUrl
print self.downloadDir
break
# dlPath = './natok.mp4' if filename is None else filename
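# Build a filesystem-friendly file name by replacing whitespace in the video title with underscores.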
fname = self.regex.replaceData('\s+', '_', title)
dlPath = './' + fname + '.mp4' if filename is None else filename
print dlPath
print '\n\n'
if self.downloadFile(videoUrl, dlPath) is True:
print 'Download complete'
else:
print 'No data found.'
def scrapTitle(self, url):
# https://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=9bZkp7q19f0&format=xml
xmlUrl = 'https://www.youtube.com/oembed?url=' + str(url) + '&format=xml'
data = self.spider.fetchData(xmlUrl)
if data and len(data) > 0:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
print data
return self.regex.getSearchedData('(?i)<title>([^<]*)</title>', data)
def downloadFile(self, url, downloadPath, retry=0):
try:
opener = urllib2.build_opener(urllib2.HTTPRedirectHandler(),
urllib2.HTTPHandler(debuglevel=0),
urllib2.HTTPSHandler(debuglevel=0))
opener.addheaders = [
('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1'),
('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
('Connection', 'keep-alive')]
# resp = opener.open(url, timeout=10)
resp = urllib2.urlopen(url, timeout=60)
print 'ok'
print resp.info()
contentLength = resp.info()['Content-Length']
contentLength = self.regex.getSearchedData('(?i)^(\d+)', contentLength)
totalSize = float(contentLength)
directory = os.path.dirname(downloadPath)
if not os.path.exists(directory):
os.makedirs(directory)
currentSize = 0
dl_file = open(downloadPath, 'ab')
try:
if os.path.getsize(downloadPath):
start = os.path.getsize(downloadPath)
currentSize = start
opener.addheaders.append(('Range', 'bytes=%s-' % (start)))
except Exception, x:
print x
res = opener.open(url, timeout=60)
CHUNK_SIZE = 256 * 1024
while True:
data = res.read(CHUNK_SIZE)
# data = resp.read(CHUNK_SIZE)
#......... some of the code is omitted here .........
Example 11: CsProduct
# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]
#......... some of the code is omitted here .........
price = ''
if priceChunk and len(priceChunk) > 0:
price = self.regex.getSearchedData('(?i)([0-9,.]+)', priceChunk)
deliveryChunk = self.regex.getSearchedData('(?i)<div class="delivery">(.*?)</div>', data)
delivery = ''
if deliveryChunk and len(deliveryChunk) > 0:
delivery = self.regex.getSearchedData('(?i)<p>([^<]*)</p>', deliveryChunk)
warrantyChunk = self.regex.getSearchedData('(?i)<div class="warranty">(.*?)</div>', data)
warranty = ''
if warrantyChunk and len(warrantyChunk) > 0:
warranty = self.regex.getSearchedData('(?i)<p>([^<]*)</p>', warrantyChunk)
## Download and save product images
productImageUrl = self.regex.getSearchedData(
'(?i)src="(http://assets.cs-catering-equipment.co.uk/media/catalog/product/cache/1/image/256x/[^"]*)"',
data)
print productImageUrl
productImage = self.regex.getSearchedData('(?i)/([a-zA-Z0-9-_.]*)$', productImageUrl)
if productImage and len(productImage) > 0:
print productImage
self.notifyProduct.emit(
'<font color=green><b>Downloading Product Image [%s]. Please wait...</b></font>' % productImage)
self.downloadFile(productImageUrl, 'product_image/' + productImage)
self.notifyProduct.emit('<font color=green><b>Downloaded Product Image [%s].</b></font>' % productImage)
# self.utils.downloadFile(productImageUrl, 'product_image/' + productImage)
## Download and save brand images
brandImageUrl = self.regex.getSearchedData(
'(?i)<div class="manufacturer-box-left"><a href="[^"]*"[^>]*?><img src="([^"]*)"', data)
brandImage = ''
if brandImageUrl and len(brandImageUrl) > 0:
brandImageUrl = self.regex.replaceData('(?i)logo/', '', brandImageUrl)
brandImage = self.regex.getSearchedData('(?i)/([a-zA-Z0-9-_.]*)$', brandImageUrl)
if brandImage and len(brandImage) > 0:
self.notifyProduct.emit(
'<font color=green><b>Downloading Brand Image [%s]. Please wait...</b></font>' % brandImage)
# self.utils.downloadFile(brandImageUrl, 'brand_image/' + brandImage)
self.downloadFile(brandImageUrl, 'brand_image/' + brandImage)
self.notifyProduct.emit('<font color=green><b>Downloaded Brand Image [%s].</b></font>' % brandImage)
csvData = [url, productCode, productName, manufacturer, listPrice, price, savePrice,
productShortDesc, productFullDesc, productTechnicalDesc, warranty, delivery,
productImage,
category1Name, category2Name, category3Name, category4Name, brandImage]
self.csvWriter.writeCsvRow(csvData)
self.logger.debug(unicode(csvData))
self.notifyProduct.emit('<b>Product Details: %s</b>' % unicode(csvData))
def downloadFile(self, url, downloadPath, retry=0):
print url
self.notifyProduct.emit('<b>File URL: %s.</b>' % url)
try:
socket.setdefaulttimeout(10)
opener = urllib2.build_opener(urllib2.HTTPRedirectHandler(),
urllib2.HTTPHandler(debuglevel=0),
urllib2.HTTPSHandler(debuglevel=0))
opener.addheaders = [
('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1')]
urllib2.install_opener(opener)
# resp = opener.open(url, timeout=30)
# resp = urllib2.urlopen(url, timeout=30)
resp = None
Example 12: BetrosProduct
# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]
#......... some of the code is omitted here .........
categoryChunk = self.regex.getSearchedData('(?i)<div id="contenuto1">(.*?)</div>\s*?</div>', data)
if categoryChunk and len(categoryChunk) > 0:
categories = self.regex.getAllSearchedData('(?i)<a href="([^"]*)"[^>]*?>([^<]*)</a>', categoryChunk)
if categories and len(categories) > 0:
self.notifyProduct.emit('<b>Total Categories Found: %s</b>' % str(len(categories)))
for category in categories:
categoryName = category[1].strip()
self.scrapSubCategory(str(category[0]).strip(), categoryName, dupCsvRows, csvWriter)
def scrapSubCategory(self, url, categoryName, dupCsvRows, csvWriter):
self.logger.debug('Category URL: ' + url)
self.notifyProduct.emit('<b>Try to scrap subcategories for: %s</b>' % categoryName)
data = self.spider.fetchData(url)
if data and len(data) > 0:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
subCategories = self.regex.getAllSearchedData('(?i)<li\s*?><a href="([^"]*)" title="([^"]*)"', data)
if subCategories and len(subCategories) > 0:
self.notifyProduct.emit(
'<font color=green><b>Total subcategories found %s.</b></font>' % str(len(subCategories)))
for subCategory in subCategories:
subCategoryName = subCategory[1].strip()
self.scrapProducts(subCategory[0].strip(), categoryName, subCategoryName, dupCsvRows, csvWriter)
def scrapProducts(self, url, categoryName, subCategoryName, dupCsvRows, csvWriter):
self.logger.debug('Product URL: ' + url)
self.notifyProduct.emit('<b>Product URL: %s.</b>' % url)
data = self.spider.fetchData(url)
if data and len(data) > 0:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
categoryDescription = self.regex.getSearchedData(
'(?i)<td class="prodottidescrizione1">\s*?<h1>[^<]*?</h1>(.*?)</td>', data)
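# Strip HTML comments first, then any remaining tags, leaving a plain-text category description.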
categoryDescription = self.regex.replaceData('(?i)<!--.*?-->', '', categoryDescription)
categoryDescription = self.regex.replaceData('(?i)<[^>]*>', '', categoryDescription)
productUrl = self.regex.getSearchedData('(?i)^(http://.*?)/', url)
categoryImage = self.regex.getSearchedData(
'(?i)<div class="boximgcat" id="boximgcatid">\s*?<a rel="shadowbox" href="([^"]*)"', data)
categoryImageName = self.regex.getSearchedData('(?i)/([a-zA-Z0-9-_. ]*)$', categoryImage)
categoryImageName = self.regex.replaceData('\s+', '_', categoryImageName.strip())
if categoryImageName is not None and len(categoryImageName) > 0 and not os.path.exists(
'category_image/' + categoryImageName):
self.notifyProduct.emit(
'<font color=green><b>Downloading Category Image: </b>%s <b>Please Wait...</b></font>' % categoryImageName)
self.downloadFile(productUrl + categoryImage, 'category_image/' + categoryImageName)
# self.utils.downloadFile(categoryImage, 'category_image/' + categoryImageName)
self.notifyProduct.emit(
'<font color=green><b>Downloaded Category Image: %s.</b></font>' % categoryImageName)
productChunks = self.regex.getSearchedData('(?i)<table.*?class="prodottiriga"[^>]*?>(.*?)</table>', data)
if productChunks and len(productChunks) > 0:
productChunk = self.regex.getAllSearchedData('(?i)<tr>(.*?</div>\s*?</td>)\s*?</tr>', productChunks)
for products in productChunk:
print 'url: ' + url
code = self.regex.getSearchedData('(?i)Cod\. ([a-zA-Z0-9 /]+)', products).strip()
for dup in dupCsvRows:
if code == dup[4]:
return
model = self.regex.getSearchedData('(?i)Mod\. ([^<]*)<', products).strip()
productName = self.regex.getSearchedData('(?i)<h1>([^<]*)</h1>', products).strip()
self.notifyProduct.emit(
'<font color=green><b>Product Name: %s.</b></font>' % productName)
desc = self.regex.getSearchedData(
'(?i)<div id="prtdescrizione\d+" style="display:none">(.*?)<div class="prodotticomando">',
products).strip()
productImage = productUrl + self.regex.getSearchedData('(?i)<img src="/tpl/\.\.([^"]*)"',
Example 13: WebPageToPdf
# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]
class WebPageToPdf(QObject):
threadPdfStatusBar = QtCore.pyqtSignal(object)
threadPdfWritingStatus = QtCore.pyqtSignal(object)
threadPdfWritingDone = QtCore.pyqtSignal(int)
def __init__(self):
QObject.__init__(self)
self.regex = Regex()
self.title = ''
self.webView = QWebView()
self.webView.settings().setAttribute(QWebSettings.AutoLoadImages, True)
self.webView.settings().setAttribute(QWebSettings.JavascriptEnabled, True)
self.webView.settings().setAttribute(QWebSettings.PluginsEnabled, True)
self.webView.settings().setAttribute(QWebSettings.DeveloperExtrasEnabled, True)
self.pdfPrinter = QPrinter()
self.webView.loadFinished.connect(self.convertToPdf)
def setupDefaultPdfPrinter(self, fileName):
self.pdfPrinter.setOrientation(QPrinter.Portrait)
self.pdfPrinter.setPageSize(QPrinter.A4)
self.pdfPrinter.setOutputFormat(QPrinter.PdfFormat)
self.pdfPrinter.setOutputFileName(fileName)
def printWebHtmlToPdf(self, url, filePath, fileName, groupType):
self.tempPdfFile = filePath + 'out.pdf'
self.filePath = filePath
self.fileName = fileName
self.url = url
self.groupType = groupType
self.setupDefaultPdfPrinter(self.tempPdfFile)
self.threadPdfStatusBar.emit('Fetching Data From Web. Please Wait...')
# self.threadPdfWritingStatus.emit(
# '<font size=4 color=green><b>Method "%s": </b></font><font color=green><b>Fetching Data From Web for</b> %s<b>.<br />Please Wait...</b></font>' % (
# self.groupType, self.url))
self.threadPdfWritingStatus.emit(
'<font color=green><b>Fetching Data From Web for</b> %s<b>.<br />Please Wait...</b></font>' % self.url)
self.webView.load(QUrl(url))
self.title = self.webView.title()
def convertToPdf(self):
print 'Generating Pdf'
# self.threadPdfWritingStatus.emit(
# '<font size=4><b>Method "%s": </b></font><b>Generating Pdf for</b> %s<b>. Please Wait...</b>' % (
# self.groupType, self.url))
self.threadPdfWritingStatus.emit(
'<b>Generating Pdf for</b> %s<b>. Please Wait...</b>' % self.url)
self.threadPdfStatusBar.emit('Generating Pdf. Please Wait...')
self.webView.print_(self.pdfPrinter)
print 'Generated Pdf'
# self.threadPdfWritingStatus.emit(
# '<font size=4><b>Method "%s": </b></font><b>Generated Pdf for</b> %s<b>. Please Wait...</b>' % (
# self.groupType, self.url))
self.threadPdfWritingStatus.emit(
'<b>Generated Pdf for</b> %s<b>. Please Wait...</b>' % self.url)
self.threadPdfStatusBar.emit('Generated Pdf.')
self.mergePdf()
self.threadPdfWritingDone.emit(True)
def mergePdf(self):
# self.threadPdfWritingStatus.emit(
# '<font size=4><b>Method "%s": </b></font><b>Setting Title for</b> %s<b>. Please Wait...</b><br />' % (
# self.groupType, self.url))
self.threadPdfWritingStatus.emit(
'<b>Setting Title for</b> %s<b>. Please Wait...</b><br />' % self.url)
packet = StringIO()
# create a new PDF with Reportlab
pdfCanvas = canvas.Canvas(packet, pagesize=A4)
pdfCanvas.setFont('Helvetica', 8)
if len(self.title) == 0:
self.title = str(self.url).split('/')[-1]
self.title = self.regex.getSearchedData('(?i)([a-zA-Z0-9-_ ]*?)\.[a-zA-Z0-9_]*$', self.title)
self.title = self.regex.replaceData('(?i)_', ' ', self.title)
title = unicode(self.title[:57] + '...') if (len(self.title) > 60) else unicode(self.title)
url = self.url[:57] + '...' if (len(self.url) > 60) else self.url
pdfCanvas.drawString(5, 830, title + ' ' + str(url).lower())
d = datetime.datetime.now()
strDate = str(d.strftime("%Y-%m-%d %H-%M-%S %p"))
pdfCanvas.drawString(420, 5, 'Created Date Time: ' + strDate)
pdfCanvas.save()
packet.seek(0)
newPdf = PdfFileReader(packet)
if not os.path.exists(self.tempPdfFile):
return self.printWebHtmlToPdf(self.url, self.filePath, self.fileName, self.groupType)
writer = PdfFileWriter()
tmpPdfFile = file(self.tempPdfFile, 'rb')
reader = PdfFileReader(tmpPdfFile)
for i in range(0, (reader.getNumPages())):
page = reader.getPage(i)
page.mergePage(newPdf.getPage(0))
# page = newPdf.getPage(0)
# page.mergePage(reader.getPage(i))
writer.addPage(page)
print 'Filename: ' + self.fileName
outputStream = file(self.filePath + self.fileName, "wb")
writer.write(outputStream)
outputStream.close()
#......... some of the code is omitted here .........
Example 14: YtDownloadManager
# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]
class YtDownloadManager(object):
def __init__(self):
self.spider = Spider()
self.regex = Regex()
self.utils = Utils()
def scrapVideoDownloadUrl(self, url):
data = self.spider.fetchData(url)
print data
soup = BeautifulSoup(data)
exit(1)
if data and len(data) > 0:
title = self.scrapTitle(url)
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
dlUrlChunk = self.regex.getSearchedData('(?i)"url_encoded_fmt_stream_map": "([^"]*)"', data)
dlUrlChunk = self.regex.replaceData('(?i)\\\\u0026', ' ', dlUrlChunk)
dlUrlParts = dlUrlChunk.split(',')
sig = ''
video = ''
videoUrl = ''
print dlUrlParts
for dlUrlPart in dlUrlParts:
dlUrlPart = urllib2.unquote(dlUrlPart)
print dlUrlPart
# if self.regex.isFoundPattern('(?i)itag=5', dlUrlPart):
urlPart = dlUrlPart.split(' ')
for part in urlPart:
print part
if self.regex.isFoundPattern('(?i)sig=.*?', part):
sig = self.regex.getSearchedData('(?i)sig=(.*?)$', part)
if self.regex.isFoundPattern('(?i)url=.*?', part):
video = self.regex.getSearchedData('(?i)url=(.*?)$', part)
print video
videoUrl = video + '&signature=' + sig
self.downloadDir = './test.flv'
# print 'Video URL= ' + videoUrl
# print self.downloadDir
# dlPath = './test.flv'
# print dlPath
print '\n\n'
# if self.downloadFile(videoUrl, dlPath) is True:
# break
def scrapTitle(self, url):
# https://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=9bZkp7q19f0&format=xml
xmlUrl = 'https://www.youtube.com/oembed?url=' + str(url) + '&format=xml'
data = self.spider.fetchData(xmlUrl)
if data and len(data) > 0:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
return self.regex.getSearchedData('(?i)<title>([^<]*)</title>', data)
def downloadFile(self, url, downloadPath, retry=0):
try:
opener = urllib2.build_opener(urllib2.HTTPRedirectHandler(),
urllib2.HTTPHandler(debuglevel=0),
urllib2.HTTPSHandler(debuglevel=0))
opener.addheaders = [
('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1'),
('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
('Connection', 'keep-alive')]
# resp = opener.open(url, timeout=10)
resp = urllib2.urlopen(url, timeout=30)
print resp.info()
contentLength = resp.info()['Content-Length']
contentLength = self.regex.getSearchedData('(?i)^(\d+)', contentLength)
totalSize = float(contentLength)
directory = os.path.dirname(downloadPath)
if not os.path.exists(directory):
os.makedirs(directory)
dl_file = open(downloadPath, 'wb')
currentSize = 0
CHUNK_SIZE = 32768
while True:
data = resp.read(CHUNK_SIZE)
if not data:
break
currentSize += len(data)
dl_file.write(data)
print('============> ' + \
str(round(float(currentSize * 100) / totalSize, 2)) + \
'% of ' + str(totalSize) + ' bytes')
notifyDl = '===> Downloaded ' + str(round(float(currentSize * 100) / totalSize, 2)) + '% of ' + str(
totalSize) + ' KB.'
if currentSize >= totalSize:
dl_file.close()
return True
except Exception, x:
error = 'Error downloading: ' + str(x)
return False
Example 15: CsProduct
# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]
#......... some of the code is omitted here .........
def scrapProductsDetails(self, url, category1Name, category2Name, category3Name, category4Name):
self.logger.debug('Product Details URL: ' + url)
self.notifyProduct.emit('<b>Try to scrap all products under Category[%s]</b>' % category4Name)
self.notifyProduct.emit('<font color=green><b>Category URL: %s</b></font>' % url)
data = self.spider.fetchData(url + '?limit=10000&mode=list')
if data and len(data) > 0:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
products = self.regex.getAllSearchedData('(?i)<div class="listing-item[^"]*?">(.*?)</div>', data)
if products and len(products) > 0:
self.totalProducts += len(products)
self.notifyProduct.emit('<font color=green><b>Total Products Found [%s]</b></font>' % unicode(self.totalProducts))
for product in products:
productDetailUrl = self.regex.getSearchedData('(?i)<a href="([^"]*)"', product)
if productDetailUrl not in self.dupCsvRows:
self.scrapProductDetails(productDetailUrl, category1Name, category2Name, category3Name,
category4Name)
else:
self.notifyProduct.emit(
'<font color=green><b>Already Exists This Product Under Category[%s]. Skip It.</b></font>' % category4Name)
def scrapProductDetails(self, url, category1Name, category2Name, category3Name, category4Name):
self.logger.debug('Product Detail URL: ' + url)
self.notifyProduct.emit('<b>Try to scrap product details under Category[%s]</b>' % category4Name)
self.notifyProduct.emit('<font color=green><b>Product Detail URL: %s</b></font>' % url)
data = self.spider.fetchData(url)
if data and len(data) > 0:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
manufacturer = self.regex.getSearchedData(
'(?i)<span class="manufacturer-box-label">Manufacturer:</span>([^<]*)</p>', data)
productCode = self.regex.getSearchedData(
'(?i)<span class="manufacturer-box-label">Model No:</span>([^<]*)</p>',
data)
productName = self.regex.getSearchedData('(?i)<div class="product-name"> <h1>([^<]*)</h1>', data)
productTechnicalDesc = self.regex.getSearchedData('(?i)<div class="product-short-description">([^<]*)</div>'
, data)
productDescriptions = self.regex.getSearchedData('(?i)<div class="product-specs">(.*?)</div>', data)
productShortDesc = ''
productFullDesc = ''
if productDescriptions and len(productDescriptions) > 0:
print 'desc: ' + productDescriptions
productShortDesc = self.regex.getSearchedData('(?i)<p>(.*?)</p>', productDescriptions)
productFullDesc = '\n'.join(
self.regex.getAllSearchedData('(?i)<li>([^<]*)</li>', productDescriptions))
listPriceChunk = self.regex.getSearchedData('(?i)<div class="rrp-price regular-price">(.*?)</div>', data)
listPrice = ''
if listPriceChunk and len(listPriceChunk) > 0:
listPrice = self.regex.getSearchedData('(?i)([0-9,.]+)', listPriceChunk)
savePriceChunk = self.regex.getSearchedData('(?i)<div class="regular-price saving-price">(.*?)</div>', data)
savePrice = ''
if savePriceChunk and len(savePriceChunk) > 0:
savePrice = self.regex.getSearchedData('(?i)([0-9%]+)', savePriceChunk)
priceChunk = self.regex.getSearchedData('(?i)<div class="[^"]*" id="product-price-\d+">(.*?)</div>', data)
price = ''
if priceChunk and len(priceChunk) > 0:
price = self.regex.getSearchedData('(?i)([0-9,.]+)', priceChunk)
deliveryChunk = self.regex.getSearchedData('(?i)<div class="delivery">(.*?)</div>', data)
delivery = ''
if deliveryChunk and len(deliveryChunk) > 0:
delivery = self.regex.getSearchedData('(?i)<p>([^<]*)</p>', deliveryChunk)
warrantyChunk = self.regex.getSearchedData('(?i)<div class="warranty">(.*?)</div>', data)
warranty = ''
if warrantyChunk and len(warrantyChunk) > 0:
warranty = self.regex.getSearchedData('(?i)<p>([^<]*)</p>', warrantyChunk)
## Download and save product images
productImageUrl = self.regex.getSearchedData(
'(?i)src="(http://assets.cs-catering-equipment.co.uk/media/catalog/product/cache/1/image/256x/[^"]*)"',
data)
print productImageUrl
productImage = self.regex.getSearchedData('(?i)/([a-zA-Z0-9-_.]*)$', productImageUrl)
if productImage and len(productImage) > 0:
print productImage
self.notifyProduct.emit('<b>Downloading Product Image [%s]. Please wait...</b>' % productImage)
self.utils.downloadFile(productImageUrl, 'product_image/' + productImage)
## Download and save brand images
brandImageUrl = self.regex.getSearchedData(
'(?i)<div class="manufacturer-box-left"><a href="[^"]*"[^>]*?><img src="([^"]*)"', data)
brandImage = ''
if brandImageUrl and len(brandImageUrl) > 0:
brandImageUrl = self.regex.replaceData('(?i)logo/', '', brandImageUrl)
brandImage = self.regex.getSearchedData('(?i)/([a-zA-Z0-9-_.]*)$', brandImageUrl)
if brandImage and len(brandImage) > 0:
self.notifyProduct.emit('<b>Downloading Brand Image [%s]. Please wait...</b>' % brandImage)
self.utils.downloadFile(brandImageUrl, 'brand_image/' + brandImage)
csvData = [url, productCode, productName, manufacturer, listPrice, price, savePrice,
productShortDesc, productFullDesc, productTechnicalDesc, warranty, delivery,
productImage,
category1Name, category2Name, category3Name, category4Name, brandImage]
self.csvWriter.writeCsvRow(csvData)
self.logger.debug(unicode(csvData))
self.notifyProduct.emit('<b>Product Details: %s</b>' % unicode(csvData))