

Python Regex.getSearchedData Method Code Examples

This article collects typical usage examples of the Python method utils.Regex.Regex.getSearchedData. If you have been wondering what Regex.getSearchedData does, how to call it, or what real-world uses look like, the curated code examples below should help. You can also explore further usage examples of the enclosing utils.Regex.Regex class.


Fifteen code examples of the Regex.getSearchedData method are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the site recommend better Python code examples.
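
All of the examples call getSearchedData(pattern, data) and expect back the text captured by the pattern's first group; the companion getAllSearchedData returns every match, and isFoundPattern, replaceData, and the reduce* methods are simple convenience helpers. For orientation, here is a minimal, illustrative sketch of a Regex wrapper that behaves the way these call sites assume. It is a reconstruction for readability only, not the project's actual utils.Regex source, so details such as the value returned when nothing matches may differ:

# Illustrative sketch only: a minimal Regex helper consistent with how the
# examples below use it. The real utils.Regex.Regex implementation may differ.
import re


class Regex(object):
    def getSearchedData(self, pattern, data):
        # First captured group of the first match, or None when nothing matches.
        match = re.search(pattern, data)
        return match.group(1) if match else None

    def getSearchedDataGroups(self, pattern, data):
        # Raw match object so callers can read individual groups (see Example 13).
        return re.search(pattern, data)

    def getAllSearchedData(self, pattern, data):
        # Every match; tuples when the pattern defines several groups.
        return re.findall(pattern, data)

    def isFoundPattern(self, pattern, data):
        return re.search(pattern, data) is not None

    def replaceData(self, pattern, replace_with, data):
        return re.sub(pattern, replace_with, data)

    def reduceNewLine(self, data):
        # Collapse line breaks so multi-line HTML can be matched in one pass.
        return re.sub(r'[\r\n]+', ' ', data)

    def reduceBlankSpace(self, data):
        # Collapse runs of whitespace into single spaces.
        return re.sub(r'\s+', ' ', data)

    def reduceNbsp(self, data):
        # Replace HTML non-breaking space entities with plain spaces.
        return re.sub(r'&nbsp;', ' ', data)


if __name__ == '__main__':
    regex = Regex()
    html = '<a href="/dp/B000123">Sample Product</a>'
    print(regex.getSearchedData('(?i)<a href="([^"]*)">', html))                # /dp/B000123
    print(regex.getAllSearchedData('(?i)<a href="([^"]*)">([^<]*)</a>', html))  # [('/dp/B000123', 'Sample Product')]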

Example 1: NisbetProduct

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import getSearchedData [as alias]
class NisbetProduct(QtCore.QThread):
    scrapProductData = QtCore.pyqtSignal(object)
    stopThread = QtCore.pyqtSignal(int)

    def __init__(self):
        QtCore.QThread.__init__(self)
        self.isExiting = False
        self.totalProducts = 0
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        dupCsvReader = Csv()
        self.dupCsvRows = dupCsvReader.readCsvRow('nisbets.csv', 0)
        self.csvWriter = Csv('nisbets.csv')
        self.mainUrl = 'http://www.nisbets.co.uk'
        csvHeaderList = ['URL', 'Product Code', 'Product Technical Specifications', 'Product Name', 'Brand',
                         'Product Price', 'Product Short Description',
                         'Product Long Description', 'Image File Name', 'User Manual File Name',
                         'Exploded View File Name', 'Spares Code', 'Accessories', 'Product Status', 'Category1',
                         'Category2', 'Category3',
                         'Category4']
        if 'URL' not in self.dupCsvRows:
            self.csvWriter.writeCsvRow(csvHeaderList)
            self.dupCsvRows.append(csvHeaderList[0])

        self.utils = Utils()

    def run(self):
        self.scrapData()

    def stop(self):
        self.isExiting = True

    def scrapData(self):
        if self.isExiting: return
        self.scrapProductData.emit('<font color=green><b>Main URL: </b>%s</font>' % self.mainUrl)
        self.logger.debug('===== URL [' + self.mainUrl + '] =====')
        data = self.spider.fetchData(self.mainUrl)
        if data and len(str(data).strip()) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            category1Chunk = self.regex.getAllSearchedData('(?i)<li id="li-id-\d+">(.*?)</ul> </li>', data)
            if category1Chunk and len(str(category1Chunk).strip()) > 0:
                i = 0
                for category1Data in category1Chunk:
                    category1 = self.regex.getSearchedData('(?i)<a href="[^"]*">([^<]*)</a>', category1Data)
                    category2Chunk = self.regex.getAllSearchedData('(?i)<li><a href="([^"]*)">([^<]*)</a>',
                                                                   category1Data)
                    if category2Chunk and len(str(category2Chunk).strip()) > 0:
                        for category2Data in category2Chunk:
                            try:
                                self.scrapCategory2Data(self.mainUrl + category2Data[0], category1, category2Data[1])
                            except Exception, x:
                                self.logger.error(x)
        self.scrapProductData.emit('<font color=red><b>Finish Scraping Product data from %s</b></font>' % self.mainUrl)
Developer: rabbicse, Project: Python-Nisbets-Scrapper, Lines: 57, Source file: NisbetProduct.py

Example 2: downloadFile

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import getSearchedData [as alias]
    def downloadFile(self, url, downloadPath, proxyHandler=None, notifier=None, retry=0):
        try:
            if os.path.exists(downloadPath) and os.path.getsize(downloadPath):
                if notifier is not None:
                    notifier.emit('<font color=red><b>Image file already exists. Skip downloading file.</b></font>')
                return
            if notifier is not None:
                notifier.emit(('<font color=blue><b>Image URL: %s</b></font>' % url))
            regex = Regex()
            opener = urllib2.build_opener(urllib2.HTTPRedirectHandler(),
                                          urllib2.HTTPHandler(debuglevel=0),
                                          urllib2.HTTPSHandler(debuglevel=0))
            opener.addheaders = [
                config.USER_AGENT,
                ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
                ('Connection', 'keep-alive')]

            if proxyHandler is not None:
                opener.add_handler(proxyHandler)
            resp = urllib2.urlopen(url, timeout=30)
            contentLength = resp.info()['Content-Length']
            contentLength = regex.getSearchedData('(?i)^(\d+)', contentLength)
            totalSize = float(contentLength)
            directory = os.path.dirname(downloadPath)
            if not os.path.exists(directory):
                os.makedirs(directory)
            dl_file = open(downloadPath, 'wb')
            currentSize = 0
            CHUNK_SIZE = 32768
            while True:
                data = resp.read(CHUNK_SIZE)
                if not data:
                    break
                currentSize += len(data)
                dl_file.write(data)
                msg = '=====> ' + str(round(float(currentSize * 100) / totalSize, 2)) + \
                      '% of ' + str(totalSize / (1024)) + ' KB'
                print('=====> ' + str(round(float(currentSize * 100) / totalSize, 2)) + '% of ' + str(
                    totalSize) + ' bytes')

                if notifier is not None:
                    notifier.emit('<font color=blue><b>%s</b></font>' % msg)
                if currentSize >= totalSize:
                    dl_file.close()
                    return True
        except Exception, x:
            print x
            notifier.emit(('<font color=red><b>Error Download Image URL: %s</b></font>' % url))
            if retry < 1:
                notifier.emit('<font color=black><b>Will retry after 5 seconds.</b></font>')
                time.sleep(5)
                notifier.emit('<font color=black><b>Retry...</b></font>')
                self.downloadFile(url, downloadPath, proxyHandler, notifier, retry + 1)
            else:
                notifier.emit('<font color=red><b>Failed to download after maximum retry.</b></font>')
Developer: tuly, Project: Python-Ebay-Details, Lines: 56, Source file: Spider.py

Example 3: AmazonScrapper

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import getSearchedData [as alias]
class AmazonScrapper():
    def __init__(self, url):
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.url = url
        self.base_product_url = 'http://www.amazon.com/dp/'
        self.base_image_url = 'http://ecx.images-amazon.com/images/I/'
        self.csvWriter = Csv('amazon.csv')
        csvDataHeader = ['URL', 'HTML Path', 'Image URLS']
        self.csvWriter.writeCsvRow(csvDataHeader)

    def scrapData(self):
        try:
            host = ('Host', 'www.amazon.com')
            data = self.spider.fetchData(self.url, host=host)
            if data:
                data = self.regex.reduceNewLine(data)
                data = self.regex.reduceBlankSpace(data)
                searchParams = self.regex.getSearchedData('(?i)var searchParams = {([^\}]*)}', data)
                searchParams = searchParams.split(',')
                seller = ''
                marketPlaceId = ''
                useMYI = ''
                for searchParam in searchParams:
                    searchParam = self.regex.reduceBlankSpace(searchParam)
                    searchParam = self.regex.replaceData('\'', '', searchParam)

                    if searchParam.startswith('seller'):
                        seller = searchParam.split(':')[1].strip()
                        seller = seller.decode('string-escape')
                    if searchParam.startswith('marketplaceID'):
                        marketPlaceId = searchParam.split(':')[1].strip()
                        marketPlaceId = marketPlaceId.decode('string-escape')
                    if searchParam.startswith('useMYI'):
                        useMYI = searchParam.split(':')[1].strip()
                        useMYI = useMYI.decode('string-escape')
                params = {'seller': seller,
                          'marketPlaceId': marketPlaceId,
                          'useMYI': useMYI}
                ajax_url = 'http://www.amazon.com/gp/aag/ajax/productWidget.html'
                self.scrapAjaxPage(ajax_url, params, host)
        except Exception, x:
            print x
Developer: rabbicse, Project: python-spider-bs4, Lines: 47, Source file: AmazonScrapper.py

Example 4: downloadFile

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import getSearchedData [as alias]
    def downloadFile(self, url, downloadPath, proxyHandler=None):
        try:
            regex = Regex()
            opener = urllib2.build_opener(urllib2.HTTPRedirectHandler(),
                                          urllib2.HTTPHandler(debuglevel=0),
                                          urllib2.HTTPSHandler(debuglevel=0))
            opener.addheaders = [
                config.USER_AGENT,
                ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
                ('Connection', 'keep-alive')]

            if proxyHandler is not None:
                opener.add_handler(proxyHandler)
            resp = urllib2.urlopen(url, timeout=30)
            contentLength = resp.info()['Content-Length']
            contentLength = regex.getSearchedData('(?i)^(\d+)', contentLength)
            totalSize = float(contentLength)
            directory = os.path.dirname(downloadPath)
            if not os.path.exists(directory):
                os.makedirs(directory)
            dl_file = open(downloadPath, 'wb')
            currentSize = 0
            CHUNK_SIZE = 32768
            while True:
                data = resp.read(CHUNK_SIZE)
                if not data:
                    break
                currentSize += len(data)
                dl_file.write(data)

                print('============> ' + str(round(float(currentSize * 100) / totalSize, 2)) + '% of ' + str(
                    totalSize) + ' bytes')
                if currentSize >= totalSize:
                    dl_file.close()
                    return True
        except Exception, x:
            print x
Developer: tuly, Project: Python-Internet-Harvesting, Lines: 39, Source file: Spider.py

Example 5: CsCat

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import getSearchedData [as alias]
class CsCat(QThread):
    notifyCategory = pyqtSignal(object)

    def __init__(self):
        QThread.__init__(self)
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        dupCsvReader = Csv()
        self.dupCsvRows = dupCsvReader.readCsvRow('cs_cat.csv')
        self.csvWriter = Csv('cs_cat.csv')
        dupFilterCsvReader = Csv()
        self.dupFilterCsvRows = dupFilterCsvReader.readCsvRow('filter_cat' + '.csv')
        self.csvW = Csv('filter_cat' + '.csv')
        self.mainUrl = 'http://www.cs-catering-equipment.co.uk/'
        self.totalCategory = 0

    def run(self):
        self.scrapCategories()
        self.notifyCategory.emit('<font color=red><b>Finished Scraping All Categories.</b></font>')

    def scrapCategories(self):
#        self.scrapFinalCategory('http://www.cs-catering-equipment.co.uk/kitchen-equipment/food-prep-machines/chocolate-fountains', '', '')
#        return
        self.notifyCategory.emit('<b>Start scraping Category.</b>')
        self.notifyCategory.emit('<font color=green><b>Main URL: %s</b></font>' % self.mainUrl)
        data = self.spider.fetchData(self.mainUrl)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            #        <a href="http://www.cs-catering-equipment.co.uk/kitchen-equipment" class="level-top" title="Kitchen Equipment"
            categories = self.regex.getAllSearchedData('(?i)<a href="([^"]*)" class="level-top" title="([^"]*)"', data)
            if categories and len(categories) > 0:
                self.totalCategory += len(categories)
                self.notifyCategory.emit(
                    '<font color=green><b>Total Category Found [%s]</b></font>' % unicode(self.totalCategory))
                for category in categories:
                    homeCategoryName = 'Home'
                    categoryName = unicode(category[1]).strip()
                    self.scrapCategory(str(category[0]).strip(), homeCategoryName, categoryName)

    def scrapCategory(self, url, rootCategoryName, categoryName):
        self.notifyCategory.emit('<font color=green><b>Start scraping URL: %s</b></font>' % url)
        data = self.spider.fetchData(url)
        if data and len(data) > 0:
            print 'category 1'
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            self.filterCategory(data, categoryName)
            categoryDesc = self.regex.getSearchedData('(?i)<div class="category-description std">([^<]*)</div>', data)
            if categoryDesc and len(categoryDesc) > 0:
                categoryDesc = unicode(categoryDesc).strip()
            csvData = [rootCategoryName, categoryName, categoryDesc]
            if csvData not in self.dupCsvRows:
                self.notifyCategory.emit('<b>Scraped Data: %s</b>' % unicode(csvData))
                self.csvWriter.writeCsvRow(csvData)
                self.dupCsvRows.append(csvData)
            else:
                self.notifyCategory.emit('<font color=green><b>Already Exists Category [%s] in csv file. Skip it.</b></font>' % categoryName)

            subCategories = self.regex.getAllSearchedData(
                '(?i)<li> <a href="([^"]*)" title="([^"]*)"[^>]*?>[^<]*?</a> </li>', data)
            if subCategories and len(subCategories) > 0:
                self.totalCategory += len(subCategories)
                self.notifyCategory.emit(
                    '<font color=green><b>Total Category Found [%s]</b></font>' % unicode(self.totalCategory))
                for subCategory in subCategories:
                    print subCategory
                    self.scrapSubCategory(subCategory[0], categoryName, subCategory[1])

    def scrapSubCategory(self, url, rootCategoryName, categoryName):
        self.notifyCategory.emit('<font color=green><b>Start scraping URL: %s</b></font>' % url)
        data = self.spider.fetchData(url)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            self.filterCategory(data, categoryName)
            categoryDesc = self.regex.getSearchedData('(?i)<div class="category-description std">([^<]*)</div>', data)
            categoryDesc = unicode(categoryDesc).strip()
            csvData = [rootCategoryName, categoryName, categoryDesc]
            if csvData not in self.dupCsvRows:
                self.csvWriter.writeCsvRow(csvData)
                self.dupCsvRows.append(csvData)
                self.notifyCategory.emit('<b>Scraped Data: %s</b>' % unicode(csvData))
            else:
                self.notifyCategory.emit('<font color=green><b>Already Exists Category [%s] in csv file. Skip it.</b></font>' % categoryName)

            subCategories = self.regex.getAllSearchedData(
                '(?i)<li> <a href="([^"]*)" title="([^"]*)"[^>]*?>[^<]*?</a> </li>', data)
            if subCategories and len(subCategories) > 0:
                self.totalCategory += len(subCategories)
                self.notifyCategory.emit(
                    '<font color=green><b>Total Category Found [%s]</b></font>' % unicode(self.totalCategory))
                for subCategory in subCategories:
                    self.scrapFinalCategory(subCategory[0], categoryName, subCategory[1])

    def scrapFinalCategory(self, url, rootCategoryName, categoryName):
        self.notifyCategory.emit('<font color=green><b>Start scraping URL: %s</b></font>' % url)
        data = self.spider.fetchData(url)
        if data and len(data) > 0:
#.........part of the code omitted here.........
Developer: rabbicse, Project: Python-Bertos-Scrapper, Lines: 103, Source file: CsCat.py

Example 6: MyLinkedIn

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import getSearchedData [as alias]
class MyLinkedIn(QThread):
    notifyLinkedIn = pyqtSignal(object)
    notifyMember = pyqtSignal(object)
    cookieL = pyqtSignal(object)

    def __init__(self, username, password):
        QThread.__init__(self)
        self.spider = Spider()
        self.regex = Regex()
        self.username = username
        self.password = password

    def run(self):
        if self.login():
            self.getAllGroups()

    def login(self):
        print "login start"
        self.notifyLinkedIn.emit("<b>Trying to login. Please wait...</b>")
        loginPageData = self.spider.fetchData("https://www.linkedin.com/uas/login?goback=&trk=hb_signin")
        loginPageData = self.regex.reduceNewLine(loginPageData)
        loginPageData = self.regex.reduceBlankSpace(loginPageData)

        ## <input type="hidden" name="session_redirect" value="" id="session_redirect-login"><input type="hidden" name="csrfToken" value="ajax:9073845200579364133" id="csrfToken-login"><input type="hidden" name="sourceAlias" value="0_7r5yezRXCiA_H0CRD8sf6DhOjTKUNps5xGTqeX8EEoi" id="sourceAlias-login">
        self.sessionRedirect = self.regex.getSearchedData(
            '(?i)<input type="hidden" name="session_redirect" value="([^"]*)"', loginPageData
        )
        self.token = self.regex.getSearchedData(
            '(?i)<input type="hidden" name="csrfToken" value="([^"]*)"', loginPageData
        )
        self.alias = self.regex.getSearchedData(
            '(?i)<input type="hidden" name="sourceAlias" value="([^"]*)"', loginPageData
        )

        loginParam = {
            "csrfToken": self.token,
            "isJsEnabled": "true",
            "session_key": self.username,
            "session_password": self.password,
            #                      'session_key': '[email protected]',
            #                      'session_password': 'ubuntu36',
            "session_redirect": self.sessionRedirect,
            "signin": "Sign In",
            "sourceAlias": self.alias,
            "source_app": "",
        }
        print loginParam
        print "start login"
        time.sleep(5)
        loginData = self.spider.login("https://www.linkedin.com/uas/login-submit", loginParam)
        loginData = self.regex.reduceNewLine(loginData)
        loginData = self.regex.reduceBlankSpace(loginData)
        #        print loginData
        isLoggedIn = self.regex.isFoundPattern('(?i)<li class="signout">', loginData)

        if isLoggedIn:
            self.notifyLinkedIn.emit("<font color=green><b>Successfully Logged In.</b></font>")
            print "login success"
            self.cookieL.emit(self.spider)
            return True
        else:
            self.notifyLinkedIn.emit(
                "<font color=red><b>Something wrong with logging in. Please try again or check manually with this username/password</b></font>"
            )
            return False

    def getAllGroups(self):
        print "start groups"
        self.notifyLinkedIn.emit("<font color=green><b>Start Scraping All Groups.</b></font>")
        self.notifyLinkedIn.emit("<b>Wait for 15 second break...</b>")
        time.sleep(15)
        self.notifyLinkedIn.emit("<b>15 second break finish!!!</b>")
        self.notifyLinkedIn.emit("<font color=green><b>Fetching data for scraping your groups.</b></font>")
        groupsUrl = "http://www.linkedin.com/myGroups?trk=hb_side_grps_top"
        groupsData = self.spider.fetchData(groupsUrl)
        self.notifyLinkedIn.emit("<font color=green><b>Data fetching complete for scraping your groups.</b></font>")
        if groupsData is not None and len(groupsData) > 0:
            print "starting groups"
            groupsData = self.regex.reduceNewLine(groupsData)
            groupsData = self.regex.reduceBlankSpace(groupsData)
            print groupsData
            ## <a href="/groups?gid=72881&amp;trk=myg_ugrp_ovr" class="private" title="This group is members only">MySQL Professionals</a>
            groupInfo = self.regex.getAllSearchedData('(?i)<a href="(/groups\?gid=[^"]*)"[^>]*>([^<]*)</a>', groupsData)
            if groupInfo is not None and len(groupInfo) > 0:
                members = []
                for group in groupInfo:
                    groupUrl = "http://www.linkedin.com" + str(group[0])
                    groupName = str(group[1])
                    self.notifyLinkedIn.emit("<b>Group Name: </b>%s <b>URL: </b>%s" % (groupName, groupUrl))
                    #                    http://www.linkedin.com/groups?members=&gid=65688&trk=anet_ug_memb
                    gid = self.regex.getSearchedData("(?i)gid=(\d+)", group[0])
                    print gid
                    groupUrl = "http://www.linkedin.com/groups?members=&gid=" + gid + "&trk=anet_ug_memb"
                    members.append((groupName, groupUrl))
                self.notifyMember.emit(members)
            self.notifyLinkedIn.emit("<font color=red><b>Finish Scraping All Groups.</b></font>")
Developer: rabbicse, Project: Python-LinkedIn-Scrapper, Lines: 98, Source file: MyLinkedIn.py

Example 7: NisbetCat

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import getSearchedData [as alias]
class NisbetCat(QtCore.QThread):
    scrapCategoryData = QtCore.pyqtSignal(object)
    stopThread = QtCore.pyqtSignal(int)

    def __init__(self):
        QtCore.QThread.__init__(self)
        self.isExiting = False
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        dupCsvReader = Csv()
        self.dupCsvRows = dupCsvReader.readCsvRow('nisbetCat.csv')
        self.csvWriter = Csv('nisbetCat.csv')
        self.mainUrl = 'http://www.nisbets.co.uk'
        csvHeaderList = ['Parent Category', 'Category Name', 'Category Description']
        if csvHeaderList not in self.dupCsvRows:
            self.csvWriter.writeCsvRow(csvHeaderList)
            self.dupCsvRows.append(csvHeaderList)

    def run(self):
        self.scrapData()

    def stop(self):
        self.isExiting = True

    def scrapData(self):
        if self.isExiting: return
        self.scrapCategoryData.emit('<font color=green><b>Main URL: </b>%s</font>' % self.mainUrl)
        self.logger.debug('===== URL [' + self.mainUrl + '] =====')
        data = self.spider.fetchData(self.mainUrl)
        if data:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            links = self.regex.getAllSearchedData('(?i)<li id="li-id-\d+"> <a href="([^"]*)">([^<]*)</a>', data)
            if links:
                for link in links:
                    self.scrapCategoryData.emit('<b>Link URL: </b>%s' % (self.mainUrl + link[0]))
                    self.logger.debug('===Link URL [' + self.mainUrl + link[0] + '] ===')
                    csvData = ['Home']
                    category = link[1]
                    csvData.append(category)

                    linkInfo = self.spider.fetchData(self.mainUrl + link[0])
                    if linkInfo:
                        linkInfo = self.regex.reduceNewLine(linkInfo)
                        linkInfo = self.regex.reduceBlankSpace(linkInfo)
                        csvData.append(
                            self.regex.getSearchedData('(?i)<p class="br5px padding10 mb0 mt10">([^<]*)</p>', linkInfo))
                        self.logger.debug('Category ' + str(csvData))
                        if csvData not in self.dupCsvRows:
                            self.csvWriter.writeCsvRow(csvData)
                            self.dupCsvRows.append(csvData)
                            self.scrapCategoryData.emit('<b>Scraped Data: </b>%s<br />' % str(csvData))
                        else:
                            self.scrapCategoryData.emit(
                                '<font color=green><b>Already Scrapped Skip This Category</b></font>')

                        ## After write first cat data
                        subUrlsChunk = self.regex.getSearchedData('(?i)<ul class="topCat clear-fix">(.*?)</ul>',
                            linkInfo)
                        if subUrlsChunk:
                            subUrls = self.regex.getAllSearchedData('(?i)<a href="([^"]*)">([^<]*)<span', subUrlsChunk)
                            if subUrls:
                                for subUrl in subUrls:
                                    self.scrapSubCat(self.mainUrl + subUrl[0], category, subUrl[1])
        self.scrapCategoryData.emit(
            '<font color=red><b>Finish Scraping Category data from %s</b></font>' % self.mainUrl)

    def scrapSubCat(self, url, parentCat, category):
        if self.isExiting: return
        self.scrapCategoryData.emit('<b>Link URL: </b>%s' % url)
        self.logger.debug('== Sub URL [' + url + '] ==')
        data = self.spider.fetchData(url)
        if data:
            csvData = [parentCat, category]
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            csvData.append(self.regex.getSearchedData('(?i)<p class="br5px padding10 mb0 mt10">([^<]*)</p>', data))
            self.logger.debug('Sub Category ' + str(csvData))
            if csvData not in self.dupCsvRows:
                self.csvWriter.writeCsvRow(csvData)
                self.dupCsvRows.append(csvData)
                self.scrapCategoryData.emit('<b>Scraped Data: </b>%s<br />' % str(csvData))
            else:
                self.scrapCategoryData.emit('<font color=green><b>Already Scrapped Skip This Category</b></font>')

            ## After write first cat data
            subUrlsChunk = self.regex.getSearchedData('(?i)<ul class="topCat clear-fix">(.*?)</ul>', data)
            if subUrlsChunk:
                subUrls = self.regex.getAllSearchedData('(?i)<a href="([^"]*)">([^<]*)<span', subUrlsChunk)
                if subUrls:
                    for subUrl in subUrls:
                        self.scrapSubSubCat(self.mainUrl + subUrl[0], category, subUrl[1])

    def scrapSubSubCat(self, url, parentCat, category):
        if self.isExiting: return
        self.scrapCategoryData.emit('<b>Link URL: </b>%s' % url)
        self.logger.debug('== SUb SUb URL [' + url + '] ==')
        data = self.spider.fetchData(url)
        if data:
#.........part of the code omitted here.........
Developer: rabbicse, Project: Python-Bertos-Scrapper, Lines: 103, Source file: NisbetCat.py

Example 8: EbagsScrapper

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import getSearchedData [as alias]
class EbagsScrapper():
    def __init__(self):
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.mainUrl = 'http://www.ebags.com'
        self.url = 'http://www.ebags.com/brands'

    def scrapData(self):
        data = self.spider.fetchData(self.url)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            allBrandLinks = []
            soup = BeautifulSoup(data)
            brandChunks = soup.find_all('div', class_='brandList4Col')
            if brandChunks and len(brandChunks) > 0:
                for brandChunk in brandChunks:
                    brandLinks = brandChunk.find_all('a')
                    if brandLinks and len(brandLinks) > 0:
                        for brandLink in brandLinks:
                            brandUrl = brandLink.get('href')
                            allBrandLinks.append(self.mainUrl + brandUrl)

            print 'Total brands found: ', str(len(allBrandLinks))
            for brandUrl in allBrandLinks:
                self.scrapBrandData(brandUrl)

    def scrapBrandData(self, url, page=1):
        brandUrl = url if page == 1 else url + '?page=' + str(page)
        print 'Brand URL: ', brandUrl
        data = self.spider.fetchData(brandUrl)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            soup = BeautifulSoup(data)
            productLinks = soup.find_all('div', class_='itemProductName')
            if productLinks and len(productLinks) > 0:
                for productLink in productLinks:
                    productUrl = productLink.find('a').get('href')
                    productDetailUrl = self.mainUrl + productUrl
                    print 'Product detail url: ', productDetailUrl
                    fileName = self.regex.getSearchedData('(?i)^.*?(\d+)$', productUrl) + '.html'
                    print 'File Name: ', fileName
                    self.scrapProductDetail(self.mainUrl + productUrl, fileName)

            ## Check if any next page
            if soup.find('a', class_='pagingNext') is not None:
                nextLink = soup.find('a', class_='pagingNext').get('href')
                print nextLink
                if nextLink is not None and len(nextLink) > 0:
                    print 'Pagination found...'
                    return self.scrapBrandData(url, page + 1)

    def scrapProductDetail(self, url, filename):
        data = self.spider.fetchData(url)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            directory = os.path.dirname('./products/' + filename)
            if not os.path.exists(directory):
                os.makedirs(directory)
            print 'File saving path: ', os.path.abspath('./products/' + filename)
            dl_file = open('./products/' + filename, 'wb')
            dl_file.write(data)
            dl_file.close()
Developer: rabbicse, Project: python-spider-bs4, Lines: 69, Source file: EbagsScrapper.py

Example 9: YoutubeScrapper

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import getSearchedData [as alias]
class YoutubeScrapper(object):
    def __init__(self):
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()


    def scrapVideoDownloadUrl(self, url, filename=None):
        data = self.spider.fetchData(url)
        if data and len(data) > 0:
            title = self.scrapTitle(url)
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            dlUrlChunk = self.regex.getSearchedData('(?i)"url_encoded_fmt_stream_map": "([^"]*)"', data)
            dlUrlChunk = self.regex.replaceData('(?i)\\\\u0026', ' ', dlUrlChunk)
            dlUrlParts = dlUrlChunk.split(',')
            sig = ''
            video = ''
            videoUrl = ''
            print dlUrlParts
            for dlUrlPart in dlUrlParts:
                dlUrlPart = urllib2.unquote(dlUrlPart)
                print dlUrlPart

                ## TODO
                if self.regex.isFoundPattern('(?i)itag=22', dlUrlPart) or self.regex.isFoundPattern('(?i)itag=18',
                                                                                                    dlUrlPart):
                    urlPart = dlUrlPart.split(' ')
                    for part in urlPart:
                        print part
                        if self.regex.isFoundPattern('(?i)sig=.*?', part):
                            sig = self.regex.getSearchedData('(?i)sig=(.*?)$', part)

                        if self.regex.isFoundPattern('(?i)url=.*?', part):
                            video = self.regex.getSearchedData('(?i)url=(.*?)$', part)
                            print video

                    videoUrl = video + '&signature=' + sig
                    self.downloadDir = './natok.mp4'

                    print 'Video URL= ' + videoUrl
                    print self.downloadDir
                    break

            # dlPath = './natok.mp4' if filename is None else filename
            fname = self.regex.replaceData('\s+', '_', title)
            dlPath = './' + fname  + '.mp4' if filename is None else filename
            print dlPath
            print '\n\n'
            if self.downloadFile(videoUrl, dlPath) is True:
                print 'Download complete'
        else:
            print 'No data found.'

    def scrapTitle(self, url):
        #        https://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=9bZkp7q19f0&format=xml
        xmlUrl = 'https://www.youtube.com/oembed?url=' + str(url) + '&format=xml'
        data = self.spider.fetchData(xmlUrl)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            print data
            return self.regex.getSearchedData('(?i)<title>([^<]*)</title>', data)

    def downloadFile(self, url, downloadPath, retry=0):
        try:
            opener = urllib2.build_opener(urllib2.HTTPRedirectHandler(),
                                          urllib2.HTTPHandler(debuglevel=0),
                                          urllib2.HTTPSHandler(debuglevel=0))
            opener.addheaders = [
                ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1'),
                ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
                ('Connection', 'keep-alive')]
            #            resp = opener.open(url, timeout=10)
            resp = urllib2.urlopen(url, timeout=60)
            print 'ok'

            print resp.info()
            contentLength = resp.info()['Content-Length']
            contentLength = self.regex.getSearchedData('(?i)^(\d+)', contentLength)
            totalSize = float(contentLength)
            directory = os.path.dirname(downloadPath)
            if not os.path.exists(directory):
                os.makedirs(directory)
            currentSize = 0
            dl_file = open(downloadPath, 'ab')
            try:
                if os.path.getsize(downloadPath):
                    start = os.path.getsize(downloadPath)
                    currentSize = start
                    opener.addheaders.append(('Range', 'bytes=%s-' % (start)))
            except Exception, x:
                print x

            res = opener.open(url, timeout=60)
            CHUNK_SIZE = 256 * 1024
            while True:
                data = res.read(CHUNK_SIZE)
                # data = resp.read(CHUNK_SIZE)
#.........part of the code omitted here.........
Developer: rabbicse, Project: Python-YTDownloader, Lines: 103, Source file: YtDownloader.py

Example 10: MainForm

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import getSearchedData [as alias]

#.........part of the code omitted here.........
        widget = QWidget()
        widget.setLayout(layout)

        self.setCentralWidget(widget)
        self.statusBar().showMessage(QString("Application Started...."), 500)
        self.setWindowTitle('PDF Batch Saver')
        self.setWindowFlags(Qt.WindowCloseButtonHint | Qt.WindowMinimizeButtonHint)
        screen = QDesktopWidget().screenGeometry()
        #        self.setFixedSize((screen.width() / 2) + 150, (screen.height() / 2) + 150)
        self.resize((screen.width() / 2) + 150, (screen.height() / 2) + 150)

    def printPdfAction(self):
        if self.fileName is not None and self.fileDir is not None and self.alreadyClickedA is False and self.typeName == 'A':
            self.webToPdf = WebPageToPdf()
            self.webToPdf.threadPdfStatusBar.connect(self.showStatus)
            self.webToPdf.threadPdfWritingStatus.connect(self.appendStatus)
            self.webToPdf.threadPdfWritingDone.connect(self.pdfGenFinished)
            f = open(self.fileName, 'rb')
            self.lists = f.readlines()
            f.close()
            self.totalUrlA = len(self.lists)
            self.alreadyClickedA = True
            self.pdfGenFinished()
        elif self.fileNameB is not None and self.fileDirB is not None and self.alreadyClickedB is False and self.typeName == 'B':
            self.webToPdfB = WebPageToPdf()
            self.webToPdfB.threadPdfStatusBar.connect(self.showStatus)
            self.webToPdfB.threadPdfWritingStatus.connect(self.appendStatus)
            self.webToPdfB.threadPdfWritingDone.connect(self.pdfGenFinishedB)
            f = open(self.fileNameB, 'rb')
            self.listsB = f.readlines()
            f.close()
            pdfFiles = [f for f in os.listdir(self.fileDirB) if f.endswith('.pdf')]
            if len(pdfFiles) > 0:
                self.pdfCounterB = int(self.regex.getSearchedData('(?i)^(\d+)_', pdfFiles[-1])) + 1
            self.totalUrlB = len(self.listsB)
            self.alreadyClickedB = True
            self.startTime = time.clock()
            self.pdfGenFinishedB()
        else:
            QMessageBox.warning(None, 'Warning', 'Please Select your URL List and PDF writing Path.')

    def pdfGenFinished(self):
        if self.lists is not None and len(self.lists) > 0:
            self.currentUrlA += 1
            url = self.lists.pop(0)
            self.lineEditWebAddress.setText(url)
            url = url.strip()
            self.labelProStatusA.setText(
                '<font color="green" size=4><b>For grouping "A": <u> %s </u> total items in the batch, processing <u> %s </u> out of <u> %s </u></b></font>' % (
                    str(
                        self.totalUrlA), str(self.currentUrlA), str(self.totalUrlA)))

            pdfFile = str(url).split('/')[-1]
            print 'pdf file : ' + pdfFile
            pdfFile = self.regex.getSearchedData('(?i)([a-zA-Z0-9-_ ]*?)\.[a-zA-Z0-9_]*$', pdfFile)
            pdfFiles = [f for f in os.listdir(self.fileDir) if f.endswith('.pdf')]
            finalPdfFile = ''
            i = 2
            for file in pdfFiles:
                if self.regex.isFoundPattern('(?i)' + pdfFile, file):
                    index = self.regex.getSearchedData('(?i)(\d+).*?$', file)
                    finalPdfFile = str(index) + '_' + str(pdfFile) + '_copy_' + str(i) + '.pdf'
                    i += 1

            if len(finalPdfFile) == 0:
                finalPdfFile = str(self.pdfCounter) + '_' + pdfFile + '.pdf'
Developer: rabbicse, Project: Python-Web2Pdf, Lines: 70, Source file: MainWindow.py

Example 11: BetrosProduct

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import getSearchedData [as alias]
class BetrosProduct(QThread):
    notifyProduct = pyqtSignal(object)

    def __init__(self):
        QThread.__init__(self)
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.mainUrl = 'http://www.bertos.com'
        self.utils = Utils()
        self.csvHeader = ['Home Category', 'Sub Category', 'Category Description', 'Category Image', 'Code',
                          'Product Code',
                          'Product Name',
                          'Product Description', 'Product Image File', 'Technical Sheet File', 'Exploded View File']
        self.totalProducts = 0

    def run(self):
        self.scrapBertos()
        self.notifyProduct.emit('<font color=red><b>Finished Scraping All products.</b></font>')

    def scrapBertos(self, retry=0):
    #        self.downloadFile('http://s900.bertos.it/download.php?file=editorcms/documentazione/schede/scheda_13722600.pdf', 'a.pdf')
    #        self.scrapSubCategory('http://s900.bertos.it/en/', '', None, None)
    #        self.scrapProducts('http://s900.bertos.it/en/pasta_cookers/', '', '', None, None)
    #        return
        self.notifyProduct.emit('<font color=green><b>Try to get all language links.</b></font>')
        self.logger.debug(self.mainUrl)
        data = self.spider.fetchData(self.mainUrl)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)

            languages = self.regex.getAllSearchedData(
                '(?i)<div class="[^"]*"><a href="([^"]*)"\s*?class="boxalingua">([^<]*)</a>', data)
            if languages and len(languages) > 0:
                self.logger.debug('Total languages: %s' % str(len(languages)))
                self.notifyProduct.emit('<b>Total languages found[%s]</b>' % str(len(languages)))
                for language in languages:
                    self.totalProducts = 0
                    url = language[0]
                    #                    if str(language[1]).lower() != 'en':
                    #                        continue
                    urlChunk = self.spider.fetchData(url)
                    if urlChunk and len(urlChunk) > 0:
                        urlChunk = self.regex.reduceNewLine(urlChunk)
                        urlChunk = self.regex.reduceBlankSpace(urlChunk)
                        url = self.regex.getSearchedData('(?i)<a href="([^"]*)" onmouseover="vedi_po_cat\(2\)\s*?"',
                            urlChunk)
                        csvFile = str(language[1].strip()).lower() + '_' + 'bertos.csv'
                        dupCsvReader = Csv()
                        dupCsvRows = dupCsvReader.readCsvRow(csvFile)
                        csvWriter = Csv(csvFile)
                        if self.csvHeader not in dupCsvRows:
                            dupCsvRows.append(self.csvHeader)
                            csvWriter.writeCsvRow(self.csvHeader)
                        self.notifyProduct.emit(
                            '<font color=green><b>Try to get data for language [%s].</b></font>' % language[1])
                        self.scrapCategory(url, dupCsvRows, csvWriter)
                        self.notifyProduct.emit(
                            '<font color=red><b>=====  Finish scraping data for [%s] =====</b></font><br /><br />' %
                            language[1])
        else:
            if retry < 5:
                return self.scrapBertos(retry + 1)


    def scrapCategory(self, mainUrl, dupCsvRows, csvWriter):
        url = mainUrl
        self.logger.debug('Main URL: ' + url)
        self.notifyProduct.emit('<font color=green><b>Main URL: %s</b></font>' % url)
        data = self.spider.fetchData(url)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            data = self.regex.reduceNbsp(data)
            self.notifyProduct.emit('<b>Try to scrap all categories.</b>')
            categoryChunk = self.regex.getSearchedData('(?i)<div id="contenuto1">(.*?)</div>\s*?</div>', data)
            if categoryChunk and len(categoryChunk) > 0:
                categories = self.regex.getAllSearchedData('(?i)<a href="([^"]*)"[^>]*?>([^<]*)</a>', categoryChunk)
                if categories and len(categories) > 0:
                    self.notifyProduct.emit('<b>Total Categories Found: %s</b>' % str(len(categories)))
                    for category in categories:
                        categoryName = category[1].strip()
                        self.scrapSubCategory(str(category[0]).strip(), categoryName, dupCsvRows, csvWriter)

    def scrapSubCategory(self, url, categoryName, dupCsvRows, csvWriter):
        self.logger.debug('Category URL: ' + url)
        self.notifyProduct.emit('<b>Try to scrap subcategories for: %s</b>' % categoryName)
        data = self.spider.fetchData(url)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            subCategories = self.regex.getAllSearchedData('(?i)<li\s*?><a href="([^"]*)" title="([^"]*)"', data)
            if subCategories and len(subCategories) > 0:
                self.notifyProduct.emit(
                    '<font color=green><b>Total subcategories found %s.</b></font>' % str(len(subCategories)))
                for subCategory in subCategories:
                    subCategoryName = subCategory[1].strip()
                    self.scrapProducts(subCategory[0].strip(), categoryName, subCategoryName, dupCsvRows, csvWriter)

#.........part of the code omitted here.........
Developer: rabbicse, Project: Python-Bertos-Scrapper, Lines: 103, Source file: BetrosProduct.py

Example 12: Scrapper

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import getSearchedData [as alias]
class Scrapper(QThread):
    notifyScrapper = pyqtSignal(object)
    isFinished = False

    def __init__(self, urllist):
        QThread.__init__(self)
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        print urllist
        self.urllist = urllist
        self.csv = Csv('scrapper.csv')

    def run(self):
        self.scrapData()
        self.notifyScrapper.emit(
            '<font color=green><b>------------------ Finish! ------------------------- </b></font>')

    def scrapData(self):
        try:
            total = 0
            csvHeader = ['URL', 'Title', 'Price', 'Brand', 'Features', 'Material', 'Measurements', 'Category',
                         'Size', 'Color', 'Design']
            self.csv.writeCsvRow(csvHeader)
            if self.isFinished: return
            for url in self.urllist:
                if len(url) > 0:
                    url = self.regex.replaceData('(?i)\r', '', url)
                    url = self.regex.replaceData('(?i)\n', '', url)
                    url = self.regex.getSearchedData('(?i)(http.*?)$', url)
                    print 'URL: ', url
                    self.notifyScrapper.emit(('<font color=green><b>URL: %s</b></font>' % url))
                    data = self.spider.fetchData(url)
                    if data and len(data) > 0:
                        data = self.regex.reduceNewLine(data)
                        data = self.regex.reduceBlankSpace(data)
                        soup = BeautifulSoup(data)
                        soup.prettify()
                        title = ''
                        price = ''
                        size = ''
                        brand = ''
                        features = ''
                        material = ''
                        measurements = ''
                        category = ''
                        color = ''
                        design = ''
                        if soup.find('span', id='vi-lkhdr-itmTitl') is not None:
                            title = soup.find('span', id='vi-lkhdr-itmTitl').text
                        if soup.find('span', id='prcIsum'):
                            price = soup.find('span', id='prcIsum').text
                        if soup.find('div', class_='itemAttr'):
                            specchunk = soup.find('div', class_='itemAttr')
                            rows = specchunk.find_all('tr')
                            for row in rows:
                                cols = row.find_all('td')
                                for i in range(0, len(cols), 2):
                                    # if self.regex.isFoundPattern('(?i)Condition:', cols[i].text.strip()):
                                    #     conditionChunk = cols[i + 1]
                                    #     conditionChunk = self.regex.replaceData(u'(?i)<span class="infoLink u-nowrap" id="readFull">.*?</span>', '', unicode(conditionChunk))
                                    #     conditionChunk = self.regex.replaceData(u'(?i)<b class="g-hdn">.*?</b>', '', conditionChunk)
                                    #     condition = BeautifulSoup(conditionChunk).text
                                    #     print condition
                                    if self.regex.isFoundPattern('(?i)Brand:', cols[i].text.strip()):
                                        brand = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Features:', cols[i].text.strip()):
                                        features = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Material:', cols[i].text.strip()):
                                        material = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Measurements:', cols[i].text.strip()):
                                        measurements = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Category:', cols[i].text.strip()):
                                        category = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Color:', cols[i].text.strip()):
                                        color = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Design:', cols[i].text.strip()):
                                        design = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Size:', cols[i].text.strip()):
                                        size = cols[i + 1].text
                        self.notifyScrapper.emit('<font color=black><b>Writting data to csv file.</b></font>')
                        csvData = [url, title, price, brand, features, material, measurements, category, size, color, design]
                        self.notifyScrapper.emit('<font color=black><b>Data: %s</b></font>' % unicode(csvData))
                        self.csv.writeCsvRow(csvData)
                        self.notifyScrapper.emit('<font color=black><b>Successfully Written data to csv file.</b></font>')
                        total += 1
                        self.notifyScrapper.emit('<font color=green><b>Total Data scrapped: [%s]</b></font>' % str(total))
        except Exception, x:
            self.notifyScrapper.emit('<font color=red><b>Error scrapping category: %s</b></font>' % x.message)
            self.logger.error(x.message)
            print x
Developer: tuly, Project: Python-Ebay-Details, Lines: 94, Source file: Scrapper.py

Example 13: MyLinkedInMembers

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import getSearchedData [as alias]
class MyLinkedInMembers(QThread):
    notifyLinkedIn = pyqtSignal(object)
    notifyMembers = pyqtSignal(object)
    cookieL = pyqtSignal(object)

    def __init__(self, spider, url, pageRange=None):
        QThread.__init__(self)
        #        self.spider = Spider()
        self.spider = spider
        self.regex = Regex()
        self.url = url
        self.startPage = None
        self.endPage = None
        if self.regex.isFoundPattern('(?i)(\d+)-(\d+)', str(pageRange).strip()):
            pageRangeFormat = self.regex.getSearchedDataGroups('(?i)(\d+)-(\d+)', str(pageRange).strip())
            self.startPage = int(pageRangeFormat.group(1))
            self.endPage = int(pageRangeFormat.group(2))
        elif self.regex.isFoundPattern('(?i)(\d+)', str(pageRange).strip()):
            pageRangeFormat = self.regex.getSearchedDataGroups('(?i)(\d+)', str(pageRange).strip())
            self.startPage = int(pageRangeFormat.group(1))
            self.endPage = self.startPage

    def run(self):
        self.getMembers(self.url)
        self.notifyLinkedIn.emit('<font color=red><b>Finish scraping members.<b></font>')

    def getMembers(self, url, pageNumber=0):
        print 'Members URL: ' + url
        self.notifyLinkedIn.emit('<font color=green><b>Start Scraping All Members.<b></font>')
        self.notifyLinkedIn.emit('<b>Wait For 15 seconds Break...<b>')
        time.sleep(15)
        self.notifyLinkedIn.emit('<b>15 seconds Break Finish.<b>')
        groupData = self.spider.fetchData(str(url).replace('&amp;', '&'))
        groupData = self.regex.reduceNewLine(groupData)
        groupData = self.regex.reduceBlankSpace(groupData)
        print groupData

        print 'page number: ' + str(pageNumber)
        if pageNumber > 0:
            harvestedMembers = []
            allMembers = self.regex.getAllSearchedData('(?i)<li class="member" id="member-[^"]*"[^>]*?>(.*?)</div>',
                groupData)
            for members in allMembers:
                memberId = self.regex.getSearchedData('(?i)data-li-memberId="([^"]*)"', members)
                memberName = self.regex.getSearchedData('(?i)data-li-fullName="([^"]*)"', members)
                memberTitle = self.regex.getSearchedData('(?i)<p class="headline">([^<]*?)</p>', members)
                memberTitle = self.regex.replaceData('(?i)&amp;', '&', memberTitle)
                harvestedMembers.append((memberId, memberName, memberTitle))
                self.notifyLinkedIn.emit('<b>Member ID: </b>%s <b>Member Name: </b>%s' % (memberId, memberName + ' (' + memberTitle + ')'))
            #            members = self.regex.getAllSearchedData(
            #                '(?i)class="send-message" data-li-memberId="([^"]*)" data-li-fullName="([^"]*)"', groupData)
            #            print members
            self.notifyMembers.emit(harvestedMembers)
            #            for member in members:
        #                print member
        #                self.notifyLinkedIn.emit('<b>Member Name: </b>%s <b>Member ID: </b>%s' % (member[1], member[0]))

        urlNext = self.regex.getSearchedData('(?i)<a href="([^"]*)"[^>]*?>\s*?<strong>\s*?next', groupData)
        if urlNext and len(urlNext) > 0:
        #            nextP = int(self.regex.getSearchedData('(?i).*?(\d+)$', urlNext.strip()))
            urlNext = self.regex.replaceData('(?i)&amp;', '&', urlNext)
            urlNext = self.regex.replaceData('(?i)split_page=\d+', 'split_page=', urlNext)
            pageNumber += 1
            if self.startPage <= pageNumber <= self.endPage:
                self.notifyLinkedIn.emit('<b>Wait for 15 second break...</b>')
                time.sleep(15)
                print 'sleep 15 s'
                self.notifyLinkedIn.emit('<b>15 second break finish!!!</b>')
                self.getMembers('http://www.linkedin.com' + urlNext + str(pageNumber), pageNumber)
            elif pageNumber < self.startPage:
                pageNumber = self.startPage
                self.notifyLinkedIn.emit('<b>Wait for 15 second break...</b>')
                time.sleep(15)
                print 'page number less 0 sleep'
                self.notifyLinkedIn.emit('<b>15 second break finish!!!</b>')
                self.getMembers('http://www.linkedin.com' + urlNext + str(pageNumber), pageNumber)

            if self.startPage is None and self.endPage is None:
                pageNumber += 1
                self.notifyLinkedIn.emit('<b>Wait for 15 second break...</b>')
                time.sleep(15)
                print 'page number less 0 sleep'
                self.notifyLinkedIn.emit('<b>15 second break finish!!!</b>')
                self.getMembers('http://www.linkedin.com' + urlNext + str(pageNumber), pageNumber)
Developer: rabbicse, Project: Python-LinkedIn-Scrapper, Lines: 86, Source file: MyLinkedInMembers.py

Example 14: CsProduct

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import getSearchedData [as alias]

#.........part of the code omitted here.........
                for category in categories:
                    print [category1Name, category2Name, category3Name, category[1]]
                    self.scrapProductsDetails(category[0], category1Name, category2Name, category3Name, category[1])

    def scrapProductsDetails(self, url, category1Name, category2Name, category3Name, category4Name):
        self.logger.debug('Product Details URL: ' + url)
        self.notifyProduct.emit('<b>Try to scrap all products under Category[%s]</b>' % category4Name)
        self.notifyProduct.emit('<font color=green><b>Category URL: %s</b></font>' % url)
        maxLimit = 25
        maxLimitChunk = self.spider.fetchData(url + '?mode=list')
        if maxLimitChunk and len(maxLimitChunk):
            maxLimitChunk = self.regex.reduceNewLine(maxLimitChunk)
            maxLimitChunk = self.regex.reduceBlankSpace(maxLimitChunk)
            maxLimits = self.regex.getAllSearchedData('<option value="[^"]*limit=(\d+)[^"]*"', maxLimitChunk)
            #            print maxLimits
            if maxLimits and len(maxLimits) > 0:
                maxLimit = max(map(int, maxLimits))
                #                print maxLimit

                #        self.notifyProduct.emit('<font color=blue><b>Max Limit: %s</b></font>' % str(maxLimit))

        data = self.spider.fetchData(url + '?limit=' + str(maxLimit) + '&mode=list')
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            products = self.regex.getAllSearchedData('(?i)<div class="listing-item[^"]*?">(.*?)</div>', data)
            if products and len(products) > 0:
                print len(products)
#                self.totalProducts += len(products)
                #                self.logger.debug('Total Products for %s is [%s]' % (str(len(products)), self.totalProducts))
                self.notifyProduct.emit('<font color=green><b>Total Products Found [%s] for category[%s]</b></font>' % (
                    str(len(products)), category4Name))
                for product in products:
                    productDetailUrl = self.regex.getSearchedData('(?i)<a href="([^"]*)"', product)
                    if productDetailUrl not in self.dupCsvRows0:
                    #                        self.totalProducts += 1
                        self.dupCsvRows0.append(productDetailUrl)
                        self.scrapProductDetails(productDetailUrl, category1Name, category2Name, category3Name,
                            category4Name)
                    else:
                        self.notifyProduct.emit(
                            '<font color=green><b>Already Exists This Product Under Category[%s]. Skip It.</b></font>' % category4Name)

                self.notifyProduct.emit(
                    '<font color=green><b>Total Products Scraped [%s].</b></font>' % str(self.totalProducts))

    def scrapProductDetails(self, url, category1Name, category2Name, category3Name, category4Name):
        self.logger.debug('Product Detail URL: ' + url)
        self.notifyProduct.emit('<b>Try to scrap product details under Category[%s]</b>' % category4Name)
        self.notifyProduct.emit('<font color=green><b>Product Detail URL: %s</b></font>' % url)
        data = self.spider.fetchData(url)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            manufacturer = self.regex.getSearchedData(
                '(?i)<span class="manufacturer-box-label">Manufacturer:</span>([^<]*)</p>', data)
            productCode = self.regex.getSearchedData(
                '(?i)<span class="manufacturer-box-label">Model No:</span>([^<]*)</p>',
                data)
            if productCode not in self.dupCsvRows:
                self.totalProducts += 1
                self.dupCsvRows.append(productCode)
            else:
                self.notifyProduct.emit(
                    '<font color=green><b>Already Exists This Product Under Category[%s]. Skip It.</b></font>' % category4Name)
                return
Developer ID: rabbicse, Project: Python-CsCatering-scrapper, Lines of code: 70, Source file: CsProduct.py
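
As a concrete illustration of the extraction step in scrapProductDetails above, the short sketch below applies the same two manufacturer/model patterns to an invented HTML fragment using only the standard re module; the fragment, the first_group helper, and the values in it are made up for demonstration and do not come from the real site.

import re

# Invented fragment shaped like the markup those two patterns expect.
html = ('<p><span class="manufacturer-box-label">Manufacturer:</span> Buffalo</p> '
        '<p><span class="manufacturer-box-label">Model No:</span> CD474</p>')

def first_group(pattern, data):
    match = re.search(pattern, data)
    return match.group(1).strip() if match else None

manufacturer = first_group('(?i)<span class="manufacturer-box-label">Manufacturer:</span>([^<]*)</p>', html)
productCode = first_group('(?i)<span class="manufacturer-box-label">Model No:</span>([^<]*)</p>', html)
print(manufacturer)  # Buffalo
print(productCode)   # CD474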

Example 15: YtDownloadManager

# Required module import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import getSearchedData [as alias]
class YtDownloadManager(object):
    def __init__(self):
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()

    def scrapVideoDownloadUrl(self, url):
        data = self.spider.fetchData(url)
        print data
        if data and len(data) > 0:
            title = self.scrapTitle(url)
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            dlUrlChunk = self.regex.getSearchedData('(?i)"url_encoded_fmt_stream_map": "([^"]*)"', data)
            dlUrlChunk = self.regex.replaceData('(?i)\\\\u0026', ' ', dlUrlChunk)
            dlUrlParts = dlUrlChunk.split(',')
            sig = ''
            video = ''
            videoUrl = ''
            print dlUrlParts
            for dlUrlPart in dlUrlParts:
                dlUrlPart = urllib2.unquote(dlUrlPart)
                print dlUrlPart

                #                if self.regex.isFoundPattern('(?i)itag=5', dlUrlPart):
                urlPart = dlUrlPart.split(' ')
                for part in urlPart:
                    print part
                    if self.regex.isFoundPattern('(?i)sig=.*?', part):
                        sig = self.regex.getSearchedData('(?i)sig=(.*?)$', part)

                    if self.regex.isFoundPattern('(?i)url=.*?', part):
                        video = self.regex.getSearchedData('(?i)url=(.*?)$', part)
                        print video

                videoUrl = video + '&signature=' + sig
                self.downloadDir = './test.flv'

                # print 'Video URL= ' + videoUrl
                # print self.downloadDir
                # dlPath = './test.flv'
                # print dlPath

                print '\n\n'

                # if self.downloadFile(videoUrl, dlPath) is True:
                #     break

    def scrapTitle(self, url):
    #        https://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=9bZkp7q19f0&format=xml
        xmlUrl = 'https://www.youtube.com/oembed?url=' + str(url) + '&format=xml'
        data = self.spider.fetchData(xmlUrl)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            return self.regex.getSearchedData('(?i)<title>([^<]*)</title>', data)

    def downloadFile(self, url, downloadPath, retry=0):
        try:
            opener = urllib2.build_opener(urllib2.HTTPRedirectHandler(),
                                          urllib2.HTTPHandler(debuglevel=0),
                                          urllib2.HTTPSHandler(debuglevel=0))
            opener.addheaders = [
                ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1'),
                ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
                ('Connection', 'keep-alive')]
            #            resp = opener.open(url, timeout=10)
            resp = urllib2.urlopen(url, timeout=30)

            print resp.info()
            contentLength = resp.info()['Content-Length']
            contentLength = self.regex.getSearchedData('(?i)^(\d+)', contentLength)
            totalSize = float(contentLength)
            directory = os.path.dirname(downloadPath)
            if not os.path.exists(directory):
                os.makedirs(directory)
            dl_file = open(downloadPath, 'wb')
            currentSize = 0
            CHUNK_SIZE = 32768
            while True:
                data = resp.read(CHUNK_SIZE)
                if not data:
                    break
                currentSize += len(data)
                dl_file.write(data)

                print('============> ' + \
                      str(round(float(currentSize * 100) / totalSize, 2)) + \
                      '% of ' + str(totalSize) + ' bytes')
                notifyDl = '===> Downloaded ' + str(round(float(currentSize * 100) / totalSize, 2)) + '% of ' + str(
                    totalSize) + ' KB.'
                if currentSize >= totalSize:
                    dl_file.close()
                    return True
        except Exception, x:
            print 'Error downloading: ' + str(x)
            return False
Developer ID: rabbicse, Project: Python-YTDownloader, Lines of code: 101, Source file: YtDownloadManager.py
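
The download-URL parsing above depends on YouTube page internals that change often; the title lookup in scrapTitle, by contrast, goes through the public oembed endpoint. The minimal, self-contained sketch below covers just that step, using only urllib2 and re (Python 2, matching the snippets on this page); the scrap_title function name is invented, while the oembed URL and the title pattern are the ones used in the example above.

import re
import urllib2  # Python 2; on Python 3 use urllib.request instead

def scrap_title(video_url):
    # Ask YouTube's oembed endpoint for XML metadata, then pull out the <title> element.
    xml_url = 'https://www.youtube.com/oembed?url=' + video_url + '&format=xml'
    data = urllib2.urlopen(xml_url, timeout=30).read()
    match = re.search('(?i)<title>([^<]*)</title>', data)
    return match.group(1) if match else None

print(scrap_title('http://www.youtube.com/watch?v=9bZkp7q19f0'))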


Note: The utils.Regex.Regex.getSearchedData method examples in this article were compiled by 纯净天空 from GitHub, MSDocs and other open-source code and documentation platforms. The code snippets are selected from open-source projects contributed by various developers, and copyright of the source code remains with the original authors. Please refer to the corresponding project's License before distributing or using the code; do not republish without permission.