This article collects typical usage examples of the Python method utils.Regex.Regex.reduceNbsp. If you have been wondering what Regex.reduceNbsp does, or how to use it, the hand-picked code samples below should help. You can also read more about its containing class, utils.Regex.Regex.
Two code examples of the Regex.reduceNbsp method are shown below, sorted by popularity by default. You can upvote the examples you find useful; your votes help the system recommend better Python code samples.
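utils.Regex is a project-internal helper whose source is not shown on this page. Judging from the calls in the examples below (reduceNewLine, reduceBlankSpace, reduceNbsp, getSearchedData, getAllSearchedData), it is a thin wrapper around Python's re module that normalizes fetched HTML before pattern matching. The following is a minimal, hypothetical sketch of that interface, reconstructed purely from how the examples use it; the exact patterns and signatures are assumptions.

import re

class Regex:
    """Hypothetical reconstruction of utils.Regex.Regex (interface inferred from usage)."""

    def reduceNewLine(self, data):
        # Collapse newlines/carriage returns so multi-line tags match single-line patterns.
        return re.sub(r'[\r\n]+', ' ', data)

    def reduceBlankSpace(self, data):
        # Squeeze runs of whitespace down to a single space.
        return re.sub(r'\s+', ' ', data)

    def reduceNbsp(self, data):
        # Replace non-breaking-space entities with plain spaces (assumed behavior).
        return re.sub(r'&nbsp;', ' ', data)

    def getSearchedData(self, pattern, data):
        # Return the first captured group, or None (assumed signature).
        match = re.search(pattern, data)
        return match.group(1) if match else None

    def getAllSearchedData(self, pattern, data):
        # Return all matches; tuples when the pattern has multiple groups (assumed signature).
        return re.findall(pattern, data)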
Example 1: BetrosProduct
# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import reduceNbsp [as alias]
# The snippet relies on several project-internal helpers; apart from utils.Regex
# (stated above), the import paths below are inferred from usage and are assumptions.
from PyQt4.QtCore import QThread, pyqtSignal
from utils.Regex import Regex
from utils.Spider import Spider
from utils.Csv import Csv
from utils.Utils import Utils
from utils.LogManager import LogManager

class BetrosProduct(QThread):
notifyProduct = pyqtSignal(object)
def __init__(self):
QThread.__init__(self)
self.logger = LogManager(__name__)
self.spider = Spider()
self.regex = Regex()
self.mainUrl = 'http://www.bertos.com'
self.utils = Utils()
        self.csvHeader = ['Home Category', 'Sub Category', 'Category Description', 'Category Image', 'Code',
                          'Product Code', 'Product Name', 'Product Description', 'Product Image File',
                          'Technical Sheet File', 'Exploded View File']
self.totalProducts = 0
def run(self):
self.scrapBertos()
self.notifyProduct.emit('<font color=red><b>Finished Scraping All products.</b></font>')
def scrapBertos(self, retry=0):
# self.downloadFile('http://s900.bertos.it/download.php?file=editorcms/documentazione/schede/scheda_13722600.pdf', 'a.pdf')
# self.scrapSubCategory('http://s900.bertos.it/en/', '', None, None)
# self.scrapProducts('http://s900.bertos.it/en/pasta_cookers/', '', '', None, None)
# return
self.notifyProduct.emit('<font color=green><b>Try to get all language links.</b></font>')
self.logger.debug(self.mainUrl)
data = self.spider.fetchData(self.mainUrl)
if data and len(data) > 0:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
languages = self.regex.getAllSearchedData(
'(?i)<div class="[^"]*"><a href="([^"]*)"\s*?class="boxalingua">([^<]*)</a>', data)
if languages and len(languages) > 0:
self.logger.debug('Total languages: %s' % str(len(languages)))
self.notifyProduct.emit('<b>Total languages found[%s]</b>' % str(len(languages)))
for language in languages:
self.totalProducts = 0
url = language[0]
# if str(language[1]).lower() != 'en':
# continue
urlChunk = self.spider.fetchData(url)
if urlChunk and len(urlChunk) > 0:
urlChunk = self.regex.reduceNewLine(urlChunk)
urlChunk = self.regex.reduceBlankSpace(urlChunk)
url = self.regex.getSearchedData('(?i)<a href="([^"]*)" onmouseover="vedi_po_cat\(2\)\s*?"',
urlChunk)
csvFile = str(language[1].strip()).lower() + '_' + 'bertos.csv'
dupCsvReader = Csv()
dupCsvRows = dupCsvReader.readCsvRow(csvFile)
csvWriter = Csv(csvFile)
if self.csvHeader not in dupCsvRows:
dupCsvRows.append(self.csvHeader)
csvWriter.writeCsvRow(self.csvHeader)
self.notifyProduct.emit(
'<font color=green><b>Try to get data for language [%s].</b></font>' % language[1])
self.scrapCategory(url, dupCsvRows, csvWriter)
self.notifyProduct.emit(
'<font color=red><b>===== Finish scraping data for [%s] =====</b></font><br /><br />' %
language[1])
else:
if retry < 5:
return self.scrapBertos(retry + 1)
def scrapCategory(self, mainUrl, dupCsvRows, csvWriter):
url = mainUrl
self.logger.debug('Main URL: ' + url)
self.notifyProduct.emit('<font color=green><b>Main URL: %s</b></font>' % url)
data = self.spider.fetchData(url)
if data and len(data) > 0:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
data = self.regex.reduceNbsp(data)
self.notifyProduct.emit('<b>Try to scrap all categories.</b>')
categoryChunk = self.regex.getSearchedData('(?i)<div id="contenuto1">(.*?)</div>\s*?</div>', data)
if categoryChunk and len(categoryChunk) > 0:
categories = self.regex.getAllSearchedData('(?i)<a href="([^"]*)"[^>]*?>([^<]*)</a>', categoryChunk)
if categories and len(categories) > 0:
self.notifyProduct.emit('<b>Total Categories Found: %s</b>' % str(len(categories)))
for category in categories:
categoryName = category[1].strip()
self.scrapSubCategory(str(category[0]).strip(), categoryName, dupCsvRows, csvWriter)
def scrapSubCategory(self, url, categoryName, dupCsvRows, csvWriter):
self.logger.debug('Category URL: ' + url)
self.notifyProduct.emit('<b>Try to scrap subcategories for: %s</b>' % categoryName)
data = self.spider.fetchData(url)
if data and len(data) > 0:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
subCategories = self.regex.getAllSearchedData('(?i)<li\s*?><a href="([^"]*)" title="([^"]*)"', data)
if subCategories and len(subCategories) > 0:
self.notifyProduct.emit(
'<font color=green><b>Total subcategories found %s.</b></font>' % str(len(subCategories)))
for subCategory in subCategories:
subCategoryName = subCategory[1].strip()
self.scrapProducts(subCategory[0].strip(), categoryName, subCategoryName, dupCsvRows, csvWriter)
#......... part of the code is omitted here .........
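The cleanup chain in scrapCategory above shows the intended calling order: reduceNewLine and reduceBlankSpace first flatten the fetched HTML onto one line, then reduceNbsp strips the &nbsp; entities so the text captured by the category regex is plain. A minimal standalone illustration, using the hypothetical Regex sketch from the introduction:

regex = Regex()
html = '<div id="contenuto1">\n  <a href="/ovens">Ovens&nbsp;&amp;&nbsp;Ranges</a>\n</div>'
html = regex.reduceNewLine(html)     # newlines -> spaces
html = regex.reduceBlankSpace(html)  # squeeze runs of whitespace
html = regex.reduceNbsp(html)        # '&nbsp;' -> ' '
# html: '<div id="contenuto1"> <a href="/ovens">Ovens &amp; Ranges</a> </div>'
print(regex.getAllSearchedData('(?i)<a href="([^"]*)"[^>]*?>([^<]*)</a>', html))
# [('/ovens', 'Ovens &amp; Ranges')]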
Example 2: WebTable
# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import reduceNbsp [as alias]
#......... part of the code is omitted here .........
# continue
urlChunk = self.spider.fetchData(url)
if urlChunk and len(urlChunk) > 0:
urlChunk = self.regex.reduceNewLine(urlChunk)
urlChunk = self.regex.reduceBlankSpace(urlChunk)
url = self.regex.getSearchedData('(?i)<a href="([^"]*)" onmouseover="vedi_po_cat\(2\)\s*?"',
urlChunk)
csvFile = str(language[1].strip()).lower() + '_' + 'bertos.csv'
dupCsvReader = Csv()
dupCsvRows = dupCsvReader.readCsvRow(csvFile)
csvWriter = Csv(csvFile)
if self.csvHeader not in dupCsvRows:
dupCsvRows.append(self.csvHeader)
csvWriter.writeCsvRow(self.csvHeader)
self.notifyProduct.emit(
'<font color=green><b>Try to get data for language [%s].</b></font>' % language[1])
self.scrapCategory(url, dupCsvRows, csvWriter)
self.notifyProduct.emit(
'<font color=red><b>===== Finish scraping data for [%s] =====</b></font><br /><br />' %
language[1])
else:
if retry < 5:
return self.scrapBertos(retry + 1)
def scrapCategory(self, mainUrl, dupCsvRows, csvWriter):
url = mainUrl
self.logger.debug('Main URL: ' + url)
self.notifyProduct.emit('<font color=green><b>Main URL: %s</b></font>' % url)
data = self.spider.fetchData(url)
if data and len(data) > 0:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
data = self.regex.reduceNbsp(data)
self.notifyProduct.emit('<b>Try to scrap all categories.</b>')
categoryChunk = self.regex.getSearchedData('(?i)<div id="contenuto1">(.*?)</div>\s*?</div>', data)
if categoryChunk and len(categoryChunk) > 0:
categories = self.regex.getAllSearchedData('(?i)<a href="([^"]*)"[^>]*?>([^<]*)</a>', categoryChunk)
if categories and len(categories) > 0:
self.notifyProduct.emit('<b>Total Categories Found: %s</b>' % str(len(categories)))
for category in categories:
categoryName = category[1].strip()
self.scrapSubCategory(str(category[0]).strip(), categoryName, dupCsvRows, csvWriter)
def scrapSubCategory(self, url, categoryName, dupCsvRows, csvWriter):
self.logger.debug('Category URL: ' + url)
self.notifyProduct.emit('<b>Try to scrap subcategories for: %s</b>' % categoryName)
data = self.spider.fetchData(url)
if data and len(data) > 0:
data = self.regex.reduceNewLine(data)
data = self.regex.reduceBlankSpace(data)
subCategories = self.regex.getAllSearchedData('(?i)<li\s*?><a href="([^"]*)" title="([^"]*)"', data)
if subCategories and len(subCategories) > 0:
self.notifyProduct.emit(
'<font color=green><b>Total subcategories found %s.</b></font>' % str(len(subCategories)))
for subCategory in subCategories:
subCategoryName = subCategory[1].strip()
self.scrapProducts(subCategory[0].strip(), categoryName, subCategoryName, dupCsvRows, csvWriter)
    def downloadFile(self, url, downloadPath, retry=0):
        # Python 2 idioms here: the bare print statement and urllib2 (both changed
        # in Python 3); socket and urllib2 are presumably imported in the omitted
        # part of the module.
        print url
        self.notifyProduct.emit('<b>File URL: %s.</b>' % url)
        try:
            socket.setdefaulttimeout(10)  # give up on stalled connections after 10 seconds
            opener = urllib2.build_opener(urllib2.HTTPRedirectHandler(),