This article collects typical code examples of the Python method logs.LogManager.LogManager.debug. If you have been struggling with questions like: what exactly does LogManager.debug do? How is it used? What do real-world calls look like? — then the curated examples below should help. You can also explore the containing class, logs.LogManager.LogManager, for further usage examples.
The following presents 9 code examples of the LogManager.debug method, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
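Before diving into the full examples, here is a minimal sketch of the call pattern they all share. It assumes only what the examples themselves demonstrate: LogManager takes a module name (typically __name__) and exposes logging-style debug/error methods, presumably delegating to Python's built-in logging module.
# Minimal usage sketch (an assumption: LogManager wraps the stdlib logging
# module; the constructor argument and method names come from the examples below).
from logs.LogManager import LogManager

logger = LogManager(__name__)
logger.debug('===== URL [%s] =====' % 'http://www.example.com')  # progress trace
try:
    raise ValueError('demo failure')
except Exception as x:
    logger.error(x)  # the examples log caught exceptions through the same object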
Example 1: NisbetProduct
# Required import: from logs.LogManager import LogManager [as alias]
# or: from logs.LogManager.LogManager import debug [as alias]
class NisbetProduct(QtCore.QThread):
scrapProductData = QtCore.pyqtSignal(object)
stopThread = QtCore.pyqtSignal(int)
def __init__(self):
QtCore.QThread.__init__(self)
self.isExiting = False
self.totalProducts = 0
self.logger = LogManager(__name__)
self.spider = Spider()
self.regex = Regex()
dupCsvReader = Csv()
self.dupCsvRows = dupCsvReader.readCsvRow('nisbets.csv', 0)
self.csvWriter = Csv('nisbets.csv')
self.mainUrl = 'http://www.nisbets.co.uk'
csvHeaderList = ['URL', 'Product Code', 'Product Technical Specifications', 'Product Name', 'Brand',
'Product Price', 'Product Short Description',
'Product Long Description', 'Image File Name', 'User Manual File Name',
'Exploded View File Name', 'Spares Code', 'Accessories', 'Product Status', 'Category1',
'Category2', 'Category3',
'Category4']
if 'URL' not in self.dupCsvRows:
self.csvWriter.writeCsvRow(csvHeaderList)
self.dupCsvRows.append(csvHeaderList[0])
self.utils = Utils()
def run(self):
self.scrapData()
def stop(self):
self.isExiting = True
def scrapData(self):
if self.isExiting: return
self.scrapProductData.emit('<font color=green><b>Main URL: </b>%s</font>' % self.mainUrl)
self.logger.debug('===== URL [' + self.mainUrl + '] =====')
data = self.spider.fetchData(self.mainUrl)
if data and len(str(data).strip()) > 0:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
category1Chunk = self.regex.getAllSearchedData('(?i)<li id="li-id-\d+">(.*?)</ul> </li>', data)
if category1Chunk and len(str(category1Chunk).strip()) > 0:
i = 0
for category1Data in category1Chunk:
category1 = self.regex.getSearchedData('(?i)<a href="[^"]*">([^<]*)</a>', category1Data)
category2Chunk = self.regex.getAllSearchedData('(?i)<li><a href="([^"]*)">([^<]*)</a>',
category1Data)
if category2Chunk and len(str(category2Chunk).strip()) > 0:
for category2Data in category2Chunk:
try:
self.scrapCategory2Data(self.mainUrl + category2Data[0], category1, category2Data[1])
except Exception, x:
self.logger.error(x)
self.scrapProductData.emit('<font color=red><b>Finished Scraping Product data from %s</b></font>' % self.mainUrl)
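Because NisbetProduct is a QThread that reports progress through the scrapProductData signal, a caller would wire it up roughly as below (a sketch; the handler and application setup are assumptions, not part of the original source):
# Hypothetical wiring for the QThread above; the handler name is an assumption.
def onProductData(msg):
    print msg  # the signal carries HTML-formatted status strings

scraper = NisbetProduct()
scraper.scrapProductData.connect(onProductData)
scraper.start()  # QThread.start() invokes run(), which calls scrapData()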
Example 2: __init__
# Required import: from logs.LogManager import LogManager [as alias]
# or: from logs.LogManager.LogManager import debug [as alias]
class Spider:
def __init__(self):
self.logger = LogManager(__name__)
self.opener = None
self.mycookie = None
def login(self, url, loginInfo, retry=0, proxy=None):
"""
Login request for user
url = '' Ex. http://www.example.com/login
loginInfo = {} Ex. {'user': 'user', 'pass': 'pass'}
"""
conn = ('Connection', 'keep-alive')
ac = ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
ln = ('Accept-Language', 'en-us,en;q=0.5')
if proxy is None:
self.opener = self.createOpener([config.USER_AGENT, conn, ac, ln], self.createCookieJarHandler())
else:
self.opener = self.createOpener([config.USER_AGENT, conn, ac, ln], self.createCookieJarHandler(), proxy)
urllib2.install_opener(self.opener)
try:
response = self.opener.open(url, urllib.urlencode(loginInfo))
print 'Response from Server:'
print 'Status: ', response.getcode()
print response.info()
self.logger.debug('Response from Server:')
self.logger.debug('Status: ' + str(response.getcode()))
self.logger.debug(response.info())
redirected_url = response.url
return redirected_url, response.read()
except Exception, x:
print x
self.logger.error(x.message)
if retry < config.RETRY_COUNT:
print 'Retry again. Please wait 5 seconds...'
time.sleep(5)
return self.login(url, loginInfo, retry + 1)
else:
print 'Failed to retrieve data after maximum %d retry!' % config.RETRY_COUNT
return None, None
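For reference, a hedged sketch of calling the login helper above; the 'log'/'pwd' field names are taken from Example 5 further down, and the URL and credentials are placeholders:
# Hypothetical call to Spider.login; URL and credentials are placeholders.
spider = Spider()
redirected_url, html = spider.login('http://www.example.com/wp-login.php', {'log': 'myuser', 'pwd': 'mypass'})
if redirected_url is not None:
    print 'Redirected to:', redirected_url  # cookies persist in the installed opener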
Example 3: NisbetProduct
# Required import: from logs.LogManager import LogManager [as alias]
# or: from logs.LogManager.LogManager import debug [as alias]
class NisbetProduct(QtCore.QThread):
scrapProductData = QtCore.pyqtSignal(object)
stopThread = QtCore.pyqtSignal(int)
def __init__(self):
QtCore.QThread.__init__(self)
self.isExiting = False
self.logger = LogManager(__name__)
self.spider = Spider()
self.regex = Regex()
dupCsvReader = Csv()
self.dupCsvRows = dupCsvReader.readCsvRow("nisbets.csv", 0)
self.csvWriter = Csv("nisbets.csv")
self.mainUrl = "http://www.nisbets.co.uk"
csvHeaderList = [
"URL",
"Product Code",
"Product Technical Specifications",
"Product Name",
"Brand",
"Product Price",
"Product Short Description",
"Product Long Description",
"Image File Name",
"User Manual File Name",
"Exploded View File Name",
"Spares Code",
"Accessories",
"Product Status" "Category1",
"Category2",
"Category3",
"Category4",
]
if "URL" not in self.dupCsvRows:
self.csvWriter.writeCsvRow(csvHeaderList)
self.dupCsvRows.append(csvHeaderList[0])
self.utils = Utils()
def run(self):
self.scrapData()
def stop(self):
self.isExiting = True
def scrapData(self):
if self.isExiting:
return
self.scrapProductData.emit("<font color=green><b>Main URL: </b>%s</font>" % self.mainUrl)
self.logger.debug("===== URL [" + self.mainUrl + "] =====")
data = self.spider.fetchData(self.mainUrl)
if data:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
category1Chunk = self.regex.getAllSearchedData('(?i)<li id="li-id-\d+">(.*?)</ul> </li>', data)
if category1Chunk:
for category1Data in category1Chunk:
category1 = self.regex.getSearchedData('(?i)<a href="[^"]*">([^<]*)</a>', category1Data)
category2Chunk = self.regex.getAllSearchedData(
'(?i)<li><a href="([^"]*)">([^<]*)</a>', category1Data
)
if category2Chunk:
for category2Data in category2Chunk:
self.scrapCategory2Data(self.mainUrl + category2Data[0], category1, category2Data[1])
self.scrapProductData.emit("<font color=red><b>Finish Scraping Product data from %s</b></font>" % self.mainUrl)
def scrapCategory2Data(self, url, category1, category2):
if self.isExiting:
return
self.scrapProductData.emit("<b>Category 2 URL: </b>%s" % url)
self.logger.debug("== Category 2 URL [" + url + "] ==")
data = self.spider.fetchData(url)
if data:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
category3Chunks = self.regex.getSearchedData('(?i)<ul class="topCat clear-fix">(.*?)</ul>', data)
if category3Chunks:
category3Chunk = self.regex.getAllSearchedData('(?i)<a href="([^"]*)">([^<]*)<', category3Chunks)
if category3Chunk:
for category3Data in category3Chunk:
self.scrapCategory3Data(self.mainUrl + category3Data[0], category1, category2, category3Data[1])
def scrapCategory3Data(self, url, category1, category2, category3):
if self.isExiting:
return
self.scrapProductData.emit("<b>Category 3 URL: </b>%s" % url)
self.logger.debug("== Category 3 URL [" + url + "] ==")
data = self.spider.fetchData(url)
if data:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
category4Chunks = self.regex.getSearchedData('(?i)<ul class="topCat clear-fix">(.*?)</ul>', data)
if category4Chunks:
category4Chunk = self.regex.getAllSearchedData('(?i)<a href="([^"]*)">([^<]*)<', category4Chunks)
if category4Chunk:
for category4Data in category4Chunk:
category4Url = self.mainUrl + category4Data[0]
self.scrapCategory4Data(category4Url, category1, category2, category3, category4Data[1])
def scrapCategory4Data(self, url, category1, category2, category3, category4):
#......... the rest of the code is omitted here .........
Example 4: NisbetCat
# Required import: from logs.LogManager import LogManager [as alias]
# or: from logs.LogManager.LogManager import debug [as alias]
class NisbetCat(QtCore.QThread):
scrapCategoryData = QtCore.pyqtSignal(object)
stopThread = QtCore.pyqtSignal(int)
def __init__(self):
QtCore.QThread.__init__(self)
self.isExiting = False
self.logger = LogManager(__name__)
self.spider = Spider()
self.regex = Regex()
dupCsvReader = Csv()
self.dupCsvRows = dupCsvReader.readCsvRow('nisbetCat.csv')
self.csvWriter = Csv('nisbetCat.csv')
self.mainUrl = 'http://www.nisbets.co.uk'
csvHeaderList = ['Parent Category', 'Category Name', 'Category Description']
if csvHeaderList not in self.dupCsvRows:
self.csvWriter.writeCsvRow(csvHeaderList)
self.dupCsvRows.append(csvHeaderList)
def run(self):
self.scrapData()
def stop(self):
self.isExiting = True
def scrapData(self):
if self.isExiting: return
self.scrapCategoryData.emit('<font color=green><b>Main URL: </b>%s</font>' % self.mainUrl)
self.logger.debug('===== URL [' + self.mainUrl + '] =====')
data = self.spider.fetchData(self.mainUrl)
if data:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
links = self.regex.getAllSearchedData('(?i)<li id="li-id-\d+"> <a href="([^"]*)">([^<]*)</a>', data)
if links:
for link in links:
self.scrapCategoryData.emit('<b>Link URL: </b>%s' % (self.mainUrl + link[0]))
self.logger.debug('===Link URL [' + self.mainUrl + link[0] + '] ===')
csvData = ['Home']
category = link[1]
csvData.append(category)
linkInfo = self.spider.fetchData(self.mainUrl + link[0])
if linkInfo:
linkInfo = self.regex.reduceNewLine(linkInfo)
linkInfo = self.regex.reduceBlankSpace(linkInfo)
csvData.append(
self.regex.getSearchedData('(?i)<p class="br5px padding10 mb0 mt10">([^<]*)</p>', linkInfo))
self.logger.debug('Category ' + str(csvData))
if csvData not in self.dupCsvRows:
self.csvWriter.writeCsvRow(csvData)
self.dupCsvRows.append(csvData)
self.scrapCategoryData.emit('<b>Scraped Data: </b>%s<br />' % str(csvData))
else:
self.scrapCategoryData.emit(
'<font color=green><b>Already Scraped, Skipping This Category</b></font>')
## After writing the first category's data
subUrlsChunk = self.regex.getSearchedData('(?i)<ul class="topCat clear-fix">(.*?)</ul>',
linkInfo)
if subUrlsChunk:
subUrls = self.regex.getAllSearchedData('(?i)<a href="([^"]*)">([^<]*)<span', subUrlsChunk)
if subUrls:
for subUrl in subUrls:
self.scrapSubCat(self.mainUrl + subUrl[0], category, subUrl[1])
self.scrapCategoryData.emit(
'<font color=red><b>Finished Scraping Category data from %s</b></font>' % self.mainUrl)
def scrapSubCat(self, url, parentCat, category):
if self.isExiting: return
self.scrapCategoryData.emit('<b>Link URL: </b>%s' % url)
self.logger.debug('== Sub URL [' + url + '] ==')
data = self.spider.fetchData(url)
if data:
csvData = [parentCat, category]
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
csvData.append(self.regex.getSearchedData('(?i)<p class="br5px padding10 mb0 mt10">([^<]*)</p>', data))
self.logger.debug('Sub Category ' + str(csvData))
if csvData not in self.dupCsvRows:
self.csvWriter.writeCsvRow(csvData)
self.dupCsvRows.append(csvData)
self.scrapCategoryData.emit('<b>Scraped Data: </b>%s<br />' % str(csvData))
else:
self.scrapCategoryData.emit('<font color=green><b>Already Scraped, Skipping This Category</b></font>')
## After writing the first category's data
subUrlsChunk = self.regex.getSearchedData('(?i)<ul class="topCat clear-fix">(.*?)</ul>', data)
if subUrlsChunk:
subUrls = self.regex.getAllSearchedData('(?i)<a href="([^"]*)">([^<]*)<span', subUrlsChunk)
if subUrls:
for subUrl in subUrls:
self.scrapSubSubCat(self.mainUrl + subUrl[0], category, subUrl[1])
def scrapSubSubCat(self, url, parentCat, category):
if self.isExiting: return
self.scrapCategoryData.emit('<b>Link URL: </b>%s' % url)
self.logger.debug('== Sub Sub URL [' + url + '] ==')
data = self.spider.fetchData(url)
if data:
#......... the rest of the code is omitted here .........
Example 5: WpScrapper
# Required import: from logs.LogManager import LogManager [as alias]
# or: from logs.LogManager.LogManager import debug [as alias]
class WpScrapper():
def __init__(self, input_file, output_file):
self.logger = LogManager(__name__)
self.spider = Spider()
self.regex = Regex()
self.utils = Utils()
self.input_file = input_file
self.output_file = output_file
def scrapData(self):
csv_writer = csv.writer(open(self.output_file, 'wb'), delimiter=';')
with open(self.input_file, 'rb') as csvfile:
csv_rows = csv.reader(csvfile, delimiter=';')
rows = list(csv_rows)
total = len(rows)
counter = 0
for row in rows:
counter += 1
print '---------------- Checking [%d] of [%d] records. ----------------------' % (counter, total)
self.logger.debug('Checking %d of %d records.' % (counter, total))
domain = 'http://' + row[0] + '/wp-login.php'
https_domain = 'https://' + row[0] + '/wp-login.php'
wp_admin = 'http://' + row[0] + '/wp-admin/'
https_wp_admin = 'https://' + row[0] + '/wp-admin/'
username = row[1]
password = row[2]
status = 0
print 'Login Credential => Domain: ' + domain + ' User: ' + username + ' Password: ' + password
self.logger.debug('Login Credential => Domain: ' + domain + ' User: ' + username + ' Password: ' + password)
if self.onLogin(domain, https_domain, wp_admin, https_wp_admin, username, password) is True:
print 'Successfully logged in.'
self.logger.debug('Successfully logged in.')
status = 1
else:
print 'Login failed!'
self.logger.debug('Login failed!')
csv_writer.writerow([row[0], username, password, status])
print '---------------- End of checking [%d] of [%d] records. ----------------------' % (counter, total)
print '\n\n'
def onLogin(self, url, https_url, wp_url, https_wp_url, username, password):
'''
Credentials are:
action login_access
i
p
password sdfsdf
username sdfsdf
'''
try:
loginCredentials = {'log': username,
'pwd': password,
'redirect_to': wp_url}
print 'Credentials', loginCredentials
print 'Please wait...Try to login with your credentials.'
redirected_url, loginData = self.spider.login(url, loginCredentials)
print 'redirected url: ', redirected_url
if loginData and len(loginData) > 0:
loginData = self.regex.reduceNewLine(loginData)
loginData = self.regex.reduceBlankSpace(loginData)
print 'After login data: ', loginData
if redirected_url is not None and redirected_url.strip() == wp_url.strip(): return True
# if loginData and len(loginData) > 0:
# loginData = self.regex.reduceNewLine(loginData)
# loginData = self.regex.reduceBlankSpace(loginData)
# soup = BeautifulSoup(loginData)
# if soup.find('div', {'id': 'login_error'}):
# return False
# else:
# return True
except Exception, x:
print x
print 'There was an error when login with http'
try:
https_loginCredentials = {'log': username,
'pwd': password,
'redirect_to': https_wp_url}
print 'Credentials', https_loginCredentials
print 'Please wait...Try to login with your credentials.'
https_redirected_url, https_login_data = self.spider.login(https_url, https_loginCredentials)
if https_redirected_url is not None and https_redirected_url.strip() == https_wp_url.strip(): return True
except Exception, x:
print x
print 'There was an error when login with https'
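A hedged driver for the checker above; the file names are placeholders, and the input format (one domain;username;password row per site, ';'-delimited) is inferred from scrapData:
# Hypothetical driver for WpScrapper; file names are placeholders.
# Expected input rows: domain;username;password (';'-delimited).
scrapper = WpScrapper('wp_sites.csv', 'wp_login_status.csv')
scrapper.scrapData()  # writes [domain, username, password, status] per site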
Example 6: BetrosProduct
# Required import: from logs.LogManager import LogManager [as alias]
# or: from logs.LogManager.LogManager import debug [as alias]
class BetrosProduct(QThread):
notifyProduct = pyqtSignal(object)
def __init__(self):
QThread.__init__(self)
self.logger = LogManager(__name__)
self.spider = Spider()
self.regex = Regex()
self.mainUrl = 'http://www.bertos.com'
self.utils = Utils()
self.csvHeader = ['Home Category', 'Sub Category', 'Category Description', 'Category Image', 'Code',
'Product Code',
'Product Name',
'Product Description', 'Product Image File', 'Technical Sheet File', 'Exploded View File']
self.totalProducts = 0
def run(self):
self.scrapBertos()
self.notifyProduct.emit('<font color=red><b>Finished Scraping All products.</b></font>')
def scrapBertos(self, retry=0):
# self.downloadFile('http://s900.bertos.it/download.php?file=editorcms/documentazione/schede/scheda_13722600.pdf', 'a.pdf')
# self.scrapSubCategory('http://s900.bertos.it/en/', '', None, None)
# self.scrapProducts('http://s900.bertos.it/en/pasta_cookers/', '', '', None, None)
# return
self.notifyProduct.emit('<font color=green><b>Try to get all language links.</b></font>')
self.logger.debug(self.mainUrl)
data = self.spider.fetchData(self.mainUrl)
if data and len(data) > 0:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
languages = self.regex.getAllSearchedData(
'(?i)<div class="[^"]*"><a href="([^"]*)"\s*?class="boxalingua">([^<]*)</a>', data)
if languages and len(languages) > 0:
self.logger.debug('Total languages: %s' % str(len(languages)))
self.notifyProduct.emit('<b>Total languages found[%s]</b>' % str(len(languages)))
for language in languages:
self.totalProducts = 0
url = language[0]
# if str(language[1]).lower() != 'en':
# continue
urlChunk = self.spider.fetchData(url)
if urlChunk and len(urlChunk) > 0:
urlChunk = self.regex.reduceNewLine(urlChunk)
urlChunk = self.regex.reduceBlankSpace(urlChunk)
url = self.regex.getSearchedData('(?i)<a href="([^"]*)" onmouseover="vedi_po_cat\(2\)\s*?"',
urlChunk)
csvFile = str(language[1].strip()).lower() + '_' + 'bertos.csv'
dupCsvReader = Csv()
dupCsvRows = dupCsvReader.readCsvRow(csvFile)
csvWriter = Csv(csvFile)
if self.csvHeader not in dupCsvRows:
dupCsvRows.append(self.csvHeader)
csvWriter.writeCsvRow(self.csvHeader)
self.notifyProduct.emit(
'<font color=green><b>Try to get data for language [%s].</b></font>' % language[1])
self.scrapCategory(url, dupCsvRows, csvWriter)
self.notifyProduct.emit(
'<font color=red><b>===== Finish scraping data for [%s] =====</b></font><br /><br />' %
language[1])
else:
if retry < 5:
return self.scrapBertos(retry + 1)
def scrapCategory(self, mainUrl, dupCsvRows, csvWriter):
url = mainUrl
self.logger.debug('Main URL: ' + url)
self.notifyProduct.emit('<font color=green><b>Main URL: %s</b></font>' % url)
data = self.spider.fetchData(url)
if data and len(data) > 0:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
data = self.regex.reduceNbsp(data)
self.notifyProduct.emit('<b>Try to scrap all categories.</b>')
categoryChunk = self.regex.getSearchedData('(?i)<div id="contenuto1">(.*?)</div>\s*?</div>', data)
if categoryChunk and len(categoryChunk) > 0:
categories = self.regex.getAllSearchedData('(?i)<a href="([^"]*)"[^>]*?>([^<]*)</a>', categoryChunk)
if categories and len(categories) > 0:
self.notifyProduct.emit('<b>Total Categories Found: %s</b>' % str(len(categories)))
for category in categories:
categoryName = category[1].strip()
self.scrapSubCategory(str(category[0]).strip(), categoryName, dupCsvRows, csvWriter)
def scrapSubCategory(self, url, categoryName, dupCsvRows, csvWriter):
self.logger.debug('Category URL: ' + url)
self.notifyProduct.emit('<b>Try to scrap subcategories for: %s</b>' % categoryName)
data = self.spider.fetchData(url)
if data and len(data) > 0:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
subCategories = self.regex.getAllSearchedData('(?i)<li\s*?><a href="([^"]*)" title="([^"]*)"', data)
if subCategories and len(subCategories) > 0:
self.notifyProduct.emit(
'<font color=green><b>Total subcategories found %s.</b></font>' % str(len(subCategories)))
for subCategory in subCategories:
subCategoryName = subCategory[1].strip()
self.scrapProducts(subCategory[0].strip(), categoryName, subCategoryName, dupCsvRows, csvWriter)
#......... the rest of the code is omitted here .........
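Note the retry idiom in scrapBertos above: on an empty fetch it calls itself with retry + 1, giving up after five attempts (Spider.login in Example 2 does the same). Stripped of the scraping details, the pattern is just this (a generic sketch, not Bertos-specific):
# Generic recursive-retry sketch mirroring scrapBertos/login above.
def fetch_with_retry(fetch, url, retry=0, max_retries=5):
    data = fetch(url)
    if data and len(data) > 0:
        return data
    if retry < max_retries:
        return fetch_with_retry(fetch, url, retry + 1, max_retries)
    return None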
Example 7: CsProduct
# Required import: from logs.LogManager import LogManager [as alias]
# or: from logs.LogManager.LogManager import debug [as alias]
class CsProduct(QThread):
notifyProduct = pyqtSignal(object)
def __init__(self):
QThread.__init__(self)
self.logger = LogManager(__name__)
self.spider = Spider()
self.regex = Regex()
dupCsvReader = Csv()
self.dupCsvRows = dupCsvReader.readCsvRow('cs_product.csv', 0)
self.csvWriter = Csv('cs_product.csv')
self.mainUrl = 'http://www.cs-catering-equipment.co.uk/'
self.utils = Utils()
self.csvWriter.writeCsvRow(
['URL', 'Product Code', 'Product Name', 'Manufacturer', 'List Price', 'Product Price', 'Discount',
'Product Short Description', 'Product Long Description', 'Product Technical Specifications', 'Warranty',
'Delivery',
'Product Image',
'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Brand Image'])
self.totalProducts = 0
def run(self):
self.scrapProduct()
self.notifyProduct.emit('<font color=red><b>Finished Scraping All products.</b></font>')
def scrapProduct(self):
self.logger.debug('Main URL: ' + self.mainUrl)
self.notifyProduct.emit('<font color=green><b>Main URL: %s</b></font>' % self.mainUrl)
data = self.spider.fetchData(self.mainUrl)
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
self.notifyProduct.emit('<b>Try to scrap all categories.</b>')
categories = self.regex.getAllSearchedData('(?i)<a href="([^"]*)" class="level-top" title="([^"]*)"', data)
if categories and len(categories) > 0:
self.notifyProduct.emit('<b>Total Categories Found: %s</b>' % str(len(categories)))
for category in categories:
category1Name = unicode(category[1]).strip()
self.scrapCategory1Data(str(category[0]).strip(), category1Name)
def scrapCategory1Data(self, url, category1Name):
self.logger.debug('Category 1 URL: ' + url)
self.notifyProduct.emit('<b>Try to scrap all categories under Category[%s]</b>' % category1Name)
self.notifyProduct.emit('<font color=green><b>Category URL: %s</b></font>' % url)
data = self.spider.fetchData(url)
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
categories = self.regex.getAllSearchedData(
'(?i)<li> <a href="([^"]*)" title="([^"]*)"[^>]*?>[^<]*?</a> </li>', data)
if categories and len(categories) > 0:
self.notifyProduct.emit('<b>Total Categories Found: %s</b>' % str(len(categories)))
for category in categories:
self.scrapCategory2Data(category[0], category1Name, category[1])
def scrapCategory2Data(self, url, category1Name, category2Name):
self.logger.debug('Category 2 URL: ' + url)
self.notifyProduct.emit('<b>Try to scrap all categories under Category[%s]</b>' % category2Name)
self.notifyProduct.emit('<font color=green><b>Category URL: %s</b></font>' % url)
data = self.spider.fetchData(url)
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
categories = self.regex.getAllSearchedData(
'(?i)<li> <a href="([^"]*)" title="([^"]*)"[^>]*?>[^<]*?</a> </li>', data)
if categories and len(categories) > 0:
for category in categories:
print 'category2: ' + category[0]
self.scrapCategory3Data(category[0], category1Name, category2Name, category[1])
def scrapCategory3Data(self, url, category1Name, category2Name, category3Name):
self.logger.debug('Category 3 URL: ' + url)
self.notifyProduct.emit('<b>Try to scrap all categories under Category[%s]</b>' % category3Name)
self.notifyProduct.emit('<font color=green><b>Category URL: %s</b></font>' % url)
data = self.spider.fetchData(url)
if data and len(data) > 0:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
categories = self.regex.getAllSearchedData(
'(?i)<li> <a href="([^"]*)" title="([^"]*)"[^>]*?>[^<]*?</a> </li>', data)
if categories and len(categories) > 0:
for category in categories:
print [category1Name, category2Name, category3Name, category[1]]
self.scrapProductsDetails(category[0], category1Name, category2Name, category3Name, category[1])
def scrapProductsDetails(self, url, category1Name, category2Name, category3Name, category4Name):
self.logger.debug('Product Details URL: ' + url)
self.notifyProduct.emit('<b>Try to scrap all products under Category[%s]</b>' % category4Name)
self.notifyProduct.emit('<font color=green><b>Category URL: %s</b></font>' % url)
data = self.spider.fetchData(url + '?limit=10000&mode=list')
if data and len(data) > 0:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
products = self.regex.getAllSearchedData('(?i)<div class="listing-item[^"]*?">(.*?)</div>', data)
if products and len(products) > 0:
self.totalProducts += len(products)
self.notifyProduct.emit('<font color=green><b>Total Products Found [%s]</b></font>' % unicode(self.totalProducts))
for product in products:
productDetailUrl = self.regex.getSearchedData('(?i)<a href="([^"]*)"', product)
if productDetailUrl not in self.dupCsvRows:
self.scrapProductDetails(productDetailUrl, category1Name, category2Name, category3Name,
category4Name)
else:
#......... the rest of the code is omitted here .........
Example 8: CsProduct
# Required import: from logs.LogManager import LogManager [as alias]
# or: from logs.LogManager.LogManager import debug [as alias]
class CsProduct(QThread):
notifyProduct = pyqtSignal(object)
def __init__(self):
QThread.__init__(self)
self.logger = LogManager(__name__)
self.spider = Spider()
self.regex = Regex()
dupCsvReader = Csv()
self.dupCsvRows0 = dupCsvReader.readCsvRow('cs_product.csv', 0)
self.dupCsvRows = dupCsvReader.readCsvRow('cs_product.csv', 1)
self.csvWriter = Csv('cs_product.csv')
self.mainUrl = 'http://www.cs-catering-equipment.co.uk/'
self.utils = Utils()
if 'Product Code' not in self.dupCsvRows:
self.csvWriter.writeCsvRow(
['URL', 'Product Code', 'Product Name', 'Manufacturer', 'List Price', 'Product Price', 'Discount',
'Product Short Description', 'Product Long Description', 'Product Technical Specifications', 'Warranty'
,
'Delivery',
'Product Image',
'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Brand Image'])
self.totalProducts = len(self.dupCsvRows)
def run(self):
self.scrapProduct()
self.notifyProduct.emit('<font color=red><b>Finished Scraping All products.</b></font>')
def scrapProduct(self):
# self.logger.debug('Main URL: ' + self.mainUrl)
self.notifyProduct.emit('<font color=green><b>Main URL: %s</b></font>' % self.mainUrl)
data = self.spider.fetchData(self.mainUrl)
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
self.notifyProduct.emit('<b>Try to scrap all categories.</b>')
categories = self.regex.getAllSearchedData('(?i)<a href="([^"]*)" class="level-top" title="([^"]*)"', data)
if categories and len(categories) > 0:
self.notifyProduct.emit('<b>Total Categories Found: %s</b>' % str(len(categories)))
for category in categories:
category1Name = unicode(category[1]).strip()
self.scrapCategory1Data(str(category[0]).strip(), category1Name)
def scrapCategory1Data(self, url, category1Name):
# self.logger.debug('Category 1 URL: ' + url)
self.notifyProduct.emit('<b>Try to scrap all categories under Category[%s]</b>' % category1Name)
self.notifyProduct.emit('<font color=green><b>Category URL: %s</b></font>' % url)
data = self.spider.fetchData(url)
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
categories = self.regex.getAllSearchedData(
'(?i)<li> <a href="([^"]*)" title="([^"]*)"[^>]*?>[^<]*?</a> </li>', data)
if categories and len(categories) > 0:
self.notifyProduct.emit('<b>Total Categories Found: %s</b>' % str(len(categories)))
for category in categories:
self.scrapCategory2Data(category[0], category1Name, category[1])
def scrapCategory2Data(self, url, category1Name, category2Name):
# self.logger.debug('Category 2 URL: ' + url)
self.notifyProduct.emit('<b>Try to scrap all categories under Category[%s]</b>' % category2Name)
self.notifyProduct.emit('<font color=green><b>Category URL: %s</b></font>' % url)
data = self.spider.fetchData(url)
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
categories = self.regex.getAllSearchedData(
'(?i)<li> <a href="([^"]*)" title="([^"]*)"[^>]*?>[^<]*?</a> </li>', data)
if categories and len(categories) > 0:
for category in categories:
print 'category2: ' + category[0]
self.scrapCategory3Data(category[0], category1Name, category2Name, category[1])
def scrapCategory3Data(self, url, category1Name, category2Name, category3Name):
# self.logger.debug('Category 3 URL: ' + url)
self.notifyProduct.emit('<b>Try to scrap all categories under Category[%s]</b>' % category3Name)
self.notifyProduct.emit('<font color=green><b>Category URL: %s</b></font>' % url)
data = self.spider.fetchData(url)
if data and len(data) > 0:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
categories = self.regex.getAllSearchedData(
'(?i)<li> <a href="([^"]*)" title="([^"]*)"[^>]*?>[^<]*?</a> </li>', data)
if categories and len(categories) > 0:
for category in categories:
print [category1Name, category2Name, category3Name, category[1]]
self.scrapProductsDetails(category[0], category1Name, category2Name, category3Name, category[1])
def scrapProductsDetails(self, url, category1Name, category2Name, category3Name, category4Name):
self.logger.debug('Product Details URL: ' + url)
self.notifyProduct.emit('<b>Try to scrap all products under Category[%s]</b>' % category4Name)
self.notifyProduct.emit('<font color=green><b>Category URL: %s</b></font>' % url)
maxLimit = 25
maxLimitChunk = self.spider.fetchData(url + '?mode=list')
if maxLimitChunk and len(maxLimitChunk):
maxLimitChunk = self.regex.reduceNewLine(maxLimitChunk)
maxLimitChunk = self.regex.reduceBlankSpace(maxLimitChunk)
maxLimits = self.regex.getAllSearchedData('<option value="[^"]*limit=(\d+)[^"]*"', maxLimitChunk)
# print maxLimits
if maxLimits and len(maxLimits) > 0:
maxLimit = max(map(int, maxLimits))
# print maxLimit
#......... the rest of the code is omitted here .........
Example 9: __init__
# Required import: from logs.LogManager import LogManager [as alias]
# or: from logs.LogManager.LogManager import debug [as alias]
class Nisbets:
def __init__(self):
self.logger = LogManager(__name__)
self.spider = Spider()
self.regex = Regex()
self.csvWriter = Csv('nisbets.csv')
self.mainUrl = 'http://www.nisbets.co.uk'
csvHeaderList = ['Category', 'Product Image Url', 'Product Code', 'Product Name', 'Price']
self.csvWriter.writeCsvRow(csvHeaderList)
def scrapData(self):
self.logger.debug('===== URL [' + self.mainUrl + '] =====')
data = self.spider.fetchData(self.mainUrl)
if data:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
data = self.regex.getSearchedData('(?i)<div class="cms-left-nav-category">(.*?)</ul>', data)
if data:
links = self.regex.getAllSearchedData('(?i)<a href="([^"]*)"', data)
if links:
for link in links:
self.scrapLinkData(self.mainUrl + link)
def scrapLinkData(self, link):
self.logger.debug('== Link URL [' + link + '] ==')
data = self.spider.fetchData(link)
if data:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
data = self.regex.getSearchedData('(?i)<h3>Brand</h3> <ul class="subCat02 clear-fix">(.*?)</ul>', data)
if data:
links = self.regex.getAllSearchedData('(?i)<a href="([^"]*)"', data)
if links:
for link in links:
self.scrapInfo(self.mainUrl + link)
def scrapInfo(self, link):
self.logger.debug('= Info URL [' + link + '] =')
data = self.spider.fetchData(link)
if data:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
category = self.regex.getSearchedData('(?i)<li><h3>Category</h3></li> <li class="remCont"> <span class="block">([^<]*)</span>', data)
allInfo = self.regex.getAllSearchedData('(?i)<div class="product-list-row clear-after">(.*?)</fieldset>', data)
if allInfo:
for info in allInfo:
csvData = []
csvData.append(category)
grpData = self.regex.getSearchedDataGroups('(?i)<img class="primaryImage" src="([^"]*)" alt="([^"]*)" />', info)
if grpData.group(1):
imageUrl = grpData.group(1)
imageUrl = self.regex.replaceData('(?i)medium', 'xlarge', imageUrl)
csvData.append(self.mainUrl + imageUrl)
else:
csvData.append('')
csvData.append(grpData.group(2))
name = self.regex.getSearchedData('(?i)<h3 class="product-name"> <a href="[^"]*">([^<]*)</a>', info)
csvData.append(name)
price = self.regex.getSearchedData(u'(?i)<div class="reduced-price"> <span class="bold">([^<]*)</span>', info)
csvData.append(price.strip()[1:])
self.logger.debug('Scraped Data ' + str(csvData))
self.csvWriter.writeCsvRow(csvData)
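Unlike the QThread examples, Nisbets in Example 9 is a plain class, so driving it takes two lines (a sketch):
# Hypothetical driver for the Nisbets scraper above.
nisbets = Nisbets()
nisbets.scrapData()  # walks categories, then brand pages, writing rows to nisbets.csv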