This article collects typical code examples of the Python method utils.Regex.Regex.getSearchedDataGroups. If you are wondering what Regex.getSearchedDataGroups does or how to use it, the curated examples below should help; you can also read more about the enclosing class utils.Regex.Regex.
Three code examples of the Regex.getSearchedDataGroups method are shown below, sorted by popularity by default.
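The source of utils.Regex.Regex is not included on this page, but every call in the examples below treats getSearchedDataGroups(pattern, data) as a regular-expression search that returns the whole match object, so the caller can read individual capture groups with .group(n); getSearchedData, by contrast, is used when only the first group is wanted. Here is a minimal sketch of that assumed behavior using only the standard re module. The implementation is inferred from the call sites, not the actual class:

import re

class Regex:
    # Sketch of the helper seen in the examples below; assumed, not the real source.
    def getSearchedDataGroups(self, pattern, data):
        # Return the full match object so callers can use .group(1), .group(2), ...
        return re.search(pattern, data)

    def getSearchedData(self, pattern, data):
        # Convenience variant: return only the first capture group, or None.
        match = re.search(pattern, data)
        return match.group(1) if match else None

regex = Regex()
info = regex.getSearchedDataGroups('(?i)<a href="([^"]*)">([^<]*)</a>',
                                   '<A HREF="/beverage-machines">Beverage Machines</A>')
if info:
    print(info.group(1))  # /beverage-machines
    print(info.group(2))  # Beverage Machines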
Example 1: NisbetProduct
# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import getSearchedDataGroups [as alias]
#......... some code omitted here .........
        if self.isExiting:
            return
        self.scrapProductData.emit("<b>Category 3 URL: </b>%s" % url)
        self.logger.debug("== Category 3 URL [" + url + "] ==")
        data = self.spider.fetchData(url)
        if data:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            category4Chunks = self.regex.getSearchedData('(?i)<ul class="topCat clear-fix">(.*?)</ul>', data)
            if category4Chunks:
                category4Chunk = self.regex.getAllSearchedData('(?i)<a href="([^"]*)">([^<]*)<', category4Chunks)
                if category4Chunk:
                    for category4Data in category4Chunk:
                        category4Url = self.mainUrl + category4Data[0]
                        self.scrapCategory4Data(category4Url, category1, category2, category3, category4Data[1])

    def scrapCategory4Data(self, url, category1, category2, category3, category4):
        if self.isExiting:
            return
        self.scrapProductData.emit("<b>Category 4 URL: </b>%s" % url)
        self.logger.debug("== Category 4 URL [" + url + "] ==")
        data = self.spider.fetchData(url)
        if data:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            categoryChunk = self.regex.getAllSearchedData(
                '(?i)<div class="product-list-row clear-after">(.*?)</fieldset>', data
            )
            if categoryChunk:
                for categoryData in categoryChunk:
                    if self.isExiting:
                        return
                    # One search, two capture groups: the product URL and the product name.
                    productInfo = self.regex.getSearchedDataGroups(
                        '(?i)<h3 class="product-name"> <a href="([^"]*)"[^>]*?>([^<]*)</a>', categoryData
                    )
                    productUrl = self.mainUrl + productInfo.group(1)
                    productName = productInfo.group(2)
                    if productUrl not in self.dupCsvRows:
                        self.dupCsvRows.append(productUrl)
                    else:
                        self.scrapProductData.emit(
                            "<font color=green><b>This item already exists in the CSV; skipping it</b></font>"
                        )
                        self.logger.debug("========= Item already exists, skipping it ===========")
                        return
                    # Again two groups from one search: the image URL and its alt text (the product code).
                    productImageInfo = self.regex.getSearchedDataGroups(
                        '(?i)<img class="primaryImage" src="([^"]*)" alt="([^"]*)"', categoryData
                    )
                    image = self.regex.replaceData("(?i)medium", "xlarge", str(productImageInfo.group(1)))
                    productImageUrl = self.mainUrl + image
                    productImage = self.regex.getSearchedData("(?i)/([a-zA-Z0-9-_.]*)$", image)
                    self.utils.downloadFile(productImageUrl, "images/" + productImage)
                    productCode = productImageInfo.group(2)
                    productTechSpecs = self.regex.getSearchedData(
                        '(?i)<p class="description">([^<]*)</p>', categoryData
                    )
                    brandName = self.regex.getSearchedData(
                        '(?i)<img class="brand-image" src="[^"]*" alt="([^"]*)"', categoryData
                    )
                    price = self.regex.getSearchedData(
                        '(?i)<div class="reduced-price"> <span class="[^"]*">([^<]*)</span>', categoryData
                    )
                    if price:
#......... the rest of this method is omitted .........
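The two getSearchedDataGroups calls above each pull a pair of capture groups out of a single search: product URL plus name, and image URL plus alt text. Run in isolation against a made-up HTML fragment (plain re stands in for the Regex helper here), the product pattern from this example behaves like this:

import re

# Hypothetical fragment shaped like the markup Example 1 expects.
categoryData = '<h3 class="product-name"> <a href="/kettles/k-123" class="listing">Electric Kettle</a>'

productInfo = re.search('(?i)<h3 class="product-name"> <a href="([^"]*)"[^>]*?>([^<]*)</a>', categoryData)
if productInfo:  # re.search returns None when nothing matches, so guard before calling .group()
    print(productInfo.group(1))  # /kettles/k-123
    print(productInfo.group(2))  # Electric Kettle

Note that Example 1 calls productInfo.group(1) without such a guard, so a page lacking this markup would raise AttributeError; the isFoundPattern check in Example 2's __init__ shows the safer pattern.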
Example 2: MyLinkedInMembers
# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import getSearchedDataGroups [as alias]
class MyLinkedInMembers(QThread):
    notifyLinkedIn = pyqtSignal(object)
    notifyMembers = pyqtSignal(object)
    cookieL = pyqtSignal(object)

    def __init__(self, spider, url, pageRange=None):
        QThread.__init__(self)
        # self.spider = Spider()
        self.spider = spider
        self.regex = Regex()
        self.url = url
        self.startPage = None
        self.endPage = None
        # A pageRange like "3-7" sets a start and an end page; a single number scrapes one page.
        if self.regex.isFoundPattern('(?i)(\d+)-(\d+)', str(pageRange).strip()):
            pageRangeFormat = self.regex.getSearchedDataGroups('(?i)(\d+)-(\d+)', str(pageRange).strip())
            self.startPage = int(pageRangeFormat.group(1))
            self.endPage = int(pageRangeFormat.group(2))
        elif self.regex.isFoundPattern('(?i)(\d+)', str(pageRange).strip()):
            pageRangeFormat = self.regex.getSearchedDataGroups('(?i)(\d+)', str(pageRange).strip())
            self.startPage = int(pageRangeFormat.group(1))
            self.endPage = self.startPage

    def run(self):
        self.getMembers(self.url)
        self.notifyLinkedIn.emit('<font color=red><b>Finished scraping members.</b></font>')

    def getMembers(self, url, pageNumber=0):
        print 'Members URL: ' + url
        self.notifyLinkedIn.emit('<font color=green><b>Start scraping all members.</b></font>')
        self.notifyLinkedIn.emit('<b>Waiting for a 15-second break...</b>')
        time.sleep(15)
        self.notifyLinkedIn.emit('<b>15-second break finished.</b>')
        groupData = self.spider.fetchData(str(url).replace('&amp;', '&'))  # decode '&amp;' entities in the URL
        groupData = self.regex.reduceNewLine(groupData)
        groupData = self.regex.reduceBlankSpace(groupData)
        print groupData
        print 'page number: ' + str(pageNumber)
        if pageNumber > 0:
            harvestedMembers = []
            allMembers = self.regex.getAllSearchedData('(?i)<li class="member" id="member-[^"]*"[^>]*?>(.*?)</div>',
                                                       groupData)
            for members in allMembers:
                memberId = self.regex.getSearchedData('(?i)data-li-memberId="([^"]*)"', members)
                memberName = self.regex.getSearchedData('(?i)data-li-fullName="([^"]*)"', members)
                memberTitle = self.regex.getSearchedData('(?i)<p class="headline">([^<]*?)</p>', members)
                memberTitle = self.regex.replaceData('(?i)&amp;', '&', memberTitle)
                harvestedMembers.append((memberId, memberName, memberTitle))
                self.notifyLinkedIn.emit('<b>Member ID: </b>%s <b>Member Name: </b>%s' % (memberId, memberName + ' (' + memberTitle + ')'))
            # members = self.regex.getAllSearchedData(
            #     '(?i)class="send-message" data-li-memberId="([^"]*)" data-li-fullName="([^"]*)"', groupData)
            # print members
            self.notifyMembers.emit(harvestedMembers)
            # for member in members:
            #     print member
            #     self.notifyLinkedIn.emit('<b>Member Name: </b>%s <b>Member ID: </b>%s' % (member[1], member[0]))
        urlNext = self.regex.getSearchedData('(?i)<a href="([^"]*)"[^>]*?>\s*?<strong>\s*?next', groupData)
        if urlNext and len(urlNext) > 0:
            # nextP = int(self.regex.getSearchedData('(?i).*?(\d+)$', urlNext.strip()))
            urlNext = self.regex.replaceData('(?i)&amp;', '&', urlNext)
            urlNext = self.regex.replaceData('(?i)split_page=\d+', 'split_page=', urlNext)
            pageNumber += 1
            # (Python 2: None compares less than any int, so both branches below are
            # skipped when no page range was given and the final check takes over.)
            if self.startPage <= pageNumber <= self.endPage:
                self.notifyLinkedIn.emit('<b>Waiting for a 15-second break...</b>')
                time.sleep(15)
                print 'sleep 15 s'
                self.notifyLinkedIn.emit('<b>15-second break finished!</b>')
                self.getMembers('http://www.linkedin.com' + urlNext + str(pageNumber), pageNumber)
            elif pageNumber < self.startPage:
                pageNumber = self.startPage
                self.notifyLinkedIn.emit('<b>Waiting for a 15-second break...</b>')
                time.sleep(15)
                print 'page number below start page; sleeping'
                self.notifyLinkedIn.emit('<b>15-second break finished!</b>')
                self.getMembers('http://www.linkedin.com' + urlNext + str(pageNumber), pageNumber)
            if self.startPage is None and self.endPage is None:
                pageNumber += 1
                self.notifyLinkedIn.emit('<b>Waiting for a 15-second break...</b>')
                time.sleep(15)
                print 'no page range given; sleeping'
                self.notifyLinkedIn.emit('<b>15-second break finished!</b>')
                self.getMembers('http://www.linkedin.com' + urlNext + str(pageNumber), pageNumber)
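In __init__ above, getSearchedDataGroups doubles as a tiny parser for the pageRange argument: '(\d+)-(\d+)' splits a "3-7" style range into start and end pages, and the single-number fallback sets both to the same value. The same logic in isolation, with plain re standing in for the helper (parse_page_range is a name made up for this sketch):

import re

def parse_page_range(pageRange):
    # "3-7" -> (3, 7); "5" -> (5, 5); anything else -> (None, None)
    text = str(pageRange).strip()
    match = re.search(r'(\d+)-(\d+)', text)
    if match:
        return int(match.group(1)), int(match.group(2))
    match = re.search(r'(\d+)', text)
    if match:
        return int(match.group(1)), int(match.group(1))
    return None, None

print(parse_page_range('3-7'))  # (3, 7)
print(parse_page_range('5'))    # (5, 5)
print(parse_page_range(None))   # (None, None)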
Example 3: __init__
# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import getSearchedDataGroups [as alias]
class Nisbets:
    def __init__(self):
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.csvWriter = Csv('nisbets.csv')
        self.mainUrl = 'http://www.nisbets.co.uk'
        csvHeaderList = ['Category', 'Product Image Url', 'Product Code', 'Product Name', 'Price']
        self.csvWriter.writeCsvRow(csvHeaderList)

    def scrapData(self):
        self.logger.debug('===== URL [' + self.mainUrl + '] =====')
        data = self.spider.fetchData(self.mainUrl)
        if data:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            data = self.regex.getSearchedData('(?i)<div class="cms-left-nav-category">(.*?)</ul>', data)
            if data:
                links = self.regex.getAllSearchedData('(?i)<a href="([^"]*)"', data)
                if links:
                    for link in links:
                        self.scrapLinkData(self.mainUrl + link)

    def scrapLinkData(self, link):
        self.logger.debug('== Link URL [' + link + '] ==')
        data = self.spider.fetchData(link)
        if data:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            data = self.regex.getSearchedData('(?i)<h3>Brand</h3> <ul class="subCat02 clear-fix">(.*?)</ul>', data)
            if data:
                links = self.regex.getAllSearchedData('(?i)<a href="([^"]*)"', data)
                if links:
                    for link in links:
                        self.scrapInfo(self.mainUrl + link)

    def scrapInfo(self, link):
        self.logger.debug('= Info URL [' + link + '] =')
        data = self.spider.fetchData(link)
        if data:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            category = self.regex.getSearchedData('(?i)<li><h3>Category</h3></li> <li class="remCont"> <span class="block">([^<]*)</span>', data)
            allInfo = self.regex.getAllSearchedData('(?i)<div class="product-list-row clear-after">(.*?)</fieldset>', data)
            if allInfo:
                for info in allInfo:
                    csvData = []
                    csvData.append(category)
                    # One search, two capture groups: the image URL and its alt text (used as the product code).
                    grpData = self.regex.getSearchedDataGroups('(?i)<img class="primaryImage" src="([^"]*)" alt="([^"]*)" />', info)
                    if grpData.group(1):
                        imageUrl = grpData.group(1)
                        imageUrl = self.regex.replaceData('(?i)medium', 'xlarge', imageUrl)
                        csvData.append(self.mainUrl + imageUrl)
                    else:
                        csvData.append('')
                    csvData.append(grpData.group(2))
                    name = self.regex.getSearchedData('(?i)<h3 class="product-name"> <a href="[^"]*">([^<]*)</a>', info)
                    csvData.append(name)
                    price = self.regex.getSearchedData(u'(?i)<div class="reduced-price"> <span class="bold">([^<]*)</span>', info)
                    csvData.append(price.strip()[1:])  # drop the leading currency symbol
                    self.logger.debug('Scraped Data ' + str(csvData))
                    self.csvWriter.writeCsvRow(csvData)
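scrapInfo above relies on one getSearchedDataGroups call for both the image URL and its alt text, then rewrites the "medium" image path to its "xlarge" variant and strips the leading currency symbol from the price. Those two post-processing steps, demonstrated on made-up values with plain re standing in for the helper (the u'' literals keep the £ intact on Python 2 as well as Python 3):

# -*- coding: utf-8 -*-
import re

# Hypothetical fragment in the shape scrapInfo expects.
info = (u'<img class="primaryImage" src="/images/medium/ab123.jpg" alt="AB123" /> '
        u'<div class="reduced-price"> <span class="bold">£12.99</span>')

grpData = re.search(u'(?i)<img class="primaryImage" src="([^"]*)" alt="([^"]*)" />', info)
imageUrl = re.sub(u'(?i)medium', u'xlarge', grpData.group(1))
print(imageUrl)          # /images/xlarge/ab123.jpg
print(grpData.group(2))  # AB123

price = re.search(u'(?i)<span class="bold">([^<]*)</span>', info).group(1)
print(price.strip()[1:])  # 12.99 -- the [1:] slice drops the £ sign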