

Python Regex.reduceNewLine Method Code Examples

This article collects typical usage examples of the Python method utils.Regex.Regex.reduceNewLine, gathered from open-source projects. If you are wondering what Regex.reduceNewLine does, how to call it, or what real-world usage looks like, the curated code examples below should help. You can also explore further usage examples of the containing class, utils.Regex.Regex.


The following shows 15 code examples of the Regex.reduceNewLine method, ordered by popularity by default.
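
All of the examples assume a small utils.Regex.Regex helper class that wraps Python's re module. Its source is not reproduced on this page, so the sketch below is only an inferred, minimal stand-in: the method names come from the call sites in the examples, and their exact semantics (for instance, whether reduceNewLine substitutes a space or an empty string) are assumptions, not the project's actual implementation.

# Hypothetical sketch of utils.Regex.Regex, inferred from the call sites below.
import re

class Regex:
    def reduceNewLine(self, data):
        # Collapse runs of carriage returns/newlines into one space so that
        # multi-line HTML can be matched with single-line regexes.
        return re.sub(r'[\r\n]+', ' ', data)

    def reduceBlankSpace(self, data):
        # Collapse runs of whitespace into a single space.
        return re.sub(r'\s+', ' ', data)

    def replaceData(self, pattern, replaceWith, data):
        return re.sub(pattern, replaceWith, data)

    def getSearchedData(self, pattern, data):
        # First captured group of the first match, or None.
        match = re.search(pattern, data)
        return match.group(1) if match else None

    def getAllSearchedData(self, pattern, data):
        # All matches; tuples when the pattern has several groups.
        return re.findall(pattern, data)

    def isFoundPattern(self, pattern, data):
        return re.search(pattern, data) is not None

With a helper along these lines, the recurring idiom in every example below (fetchData, then reduceNewLine, then reduceBlankSpace) normalizes a fetched HTML page onto a single line before any pattern matching is attempted.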

Example 1: __init__

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import reduceNewLine [as alias]
class TopsyScrapper:
    isFinished = False

    def __init__(self, filename):
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.filename = filename
        self.url = 'http://topsy.com/s?'
        self.csvWriter = Csv('topsy.csv')
        csvDataHeader = ['Keyword', 'Tweets in last 30 days', 'Topsy Sentiment Score', 'Date of scrape']
        self.csvWriter.writeCsvRow(csvDataHeader)

    def run(self):
        self.scrapData()
        self.csvWriter.closeWriter()

    def scrapData(self):
        try:
            file = open(self.filename, 'rb')
            for line in file.readlines():
                if self.isFinished: return
                line = self.regex.replaceData('\r+', '', line)
                line = self.regex.reduceNewLine(line)
                line = self.regex.reduceBlankSpace(line)
                line = line.strip()
                if len(line) > 0:
                    params = urllib.urlencode({'q': line, 'window': 'm', 'type': 'tweet'})
                    url = self.url + params
                    self.scrapBrowserData(url, line)
        except Exception, x:
            print x
Author: tuly, Project: Python-Internet-Harvesting, Lines: 35, Source: TopsyScrapper.py

Example 2: __init__

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import reduceNewLine [as alias]
class GoogleFinanceScrapper:
    isFinished = False

    def __init__(self, filename):
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.filename = filename
        self.url = 'https://www.google.com/finance?'
        self.main_url = 'https://www.google.com'
        self.csvWriter = Csv('google_finance.csv')
        csvDataHeader = ['Ticker Symbol', 'Quarter End', 'Revenue', 'Total Revenue', 'Date of Scrape']
        self.csvWriter.writeCsvRow(csvDataHeader)

    def run(self):
        self.scrapData()
        self.csvWriter.closeWriter()

    def scrapData(self):
        try:
            file = open(self.filename, 'rb')
            for line in file.readlines():
                if self.isFinished: return
                line = self.regex.replaceData('\r+', '', line)
                line = self.regex.reduceNewLine(line)
                line = self.regex.reduceBlankSpace(line)
                line = line.strip()
                params = urllib.urlencode({'q': line})
                url = self.url + params
                self.scrapBykeyword(url, line)
        except Exception, x:
            print x
            self.logger.error('Error: ' + x.message)
Author: tuly, Project: Python-Internet-Harvesting, Lines: 36, Source: GoogleFinanceScrapper.py

Example 3: NisbetProduct

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import reduceNewLine [as alias]
class NisbetProduct(QtCore.QThread):
    scrapProductData = QtCore.pyqtSignal(object)
    stopThread = QtCore.pyqtSignal(int)

    def __init__(self):
        QtCore.QThread.__init__(self)
        self.isExiting = False
        self.totalProducts = 0
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        dupCsvReader = Csv()
        self.dupCsvRows = dupCsvReader.readCsvRow('nisbets.csv', 0)
        self.csvWriter = Csv('nisbets.csv')
        self.mainUrl = 'http://www.nisbets.co.uk'
        csvHeaderList = ['URL', 'Product Code', 'Product Technical Specifications', 'Product Name', 'Brand',
                         'Product Price', 'Product Short Description',
                         'Product Long Description', 'Image File Name', 'User Manual File Name',
                         'Exploded View File Name', 'Spares Code', 'Accessories', 'Product Status', 'Category1',
                         'Category2', 'Category3', 'Category4']
        if 'URL' not in self.dupCsvRows:
            self.csvWriter.writeCsvRow(csvHeaderList)
            self.dupCsvRows.append(csvHeaderList[0])

        self.utils = Utils()

    def run(self):
        self.scrapData()

    def stop(self):
        self.isExiting = True

    def scrapData(self):
        if self.isExiting: return
        self.scrapProductData.emit('<font color=green><b>Main URL: </b>%s</font>' % self.mainUrl)
        self.logger.debug('===== URL [' + self.mainUrl + '] =====')
        data = self.spider.fetchData(self.mainUrl)
        if data and len(str(data).strip()) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            category1Chunk = self.regex.getAllSearchedData('(?i)<li id="li-id-\d+">(.*?)</ul> </li>', data)
            if category1Chunk and len(str(category1Chunk).strip()) > 0:
                i = 0
                for category1Data in category1Chunk:
                    category1 = self.regex.getSearchedData('(?i)<a href="[^"]*">([^<]*)</a>', category1Data)
                    category2Chunk = self.regex.getAllSearchedData('(?i)<li><a href="([^"]*)">([^<]*)</a>',
                                                                   category1Data)
                    if category2Chunk and len(str(category2Chunk).strip()) > 0:
                        for category2Data in category2Chunk:
                            try:
                                self.scrapCategory2Data(self.mainUrl + category2Data[0], category1, category2Data[1])
                            except Exception, x:
                                self.logger.error(x)
        self.scrapProductData.emit('<font color=red><b>Finish Scraping Product data from %s</b></font>' % self.mainUrl)
Author: rabbicse, Project: Python-Nisbets-Scrapper, Lines: 57, Source: NisbetProduct.py

Example 4: PentaTransaction

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import reduceNewLine [as alias]
class PentaTransaction():
    def __init__(self):
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.loginUrl = 'http://www.v4.penta-transaction.com/telematica_v4/login_ing.jsp'
        self.username = 'web9501201'
        self.password = '784693'
        self.collectionUrl = 'http://www.trggroup.net/victorinox/index.php?p=124'
        self.mainUrl = 'http://www.penta-transaction.com'

    def scrapData(self):
        self.onLogin()
        data = self.spider.fetchData(self.mainUrl)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            print data

    def onLogin(self):
        '''
        Credentials are:
        action	login_access
        i
        p
        password	sdfsdf
        username	sdfsdf
        '''
        try:
            print self.loginUrl
            loginCredentials = {'username': self.username,
                                'password': self.password}
            loginData = self.spider.login(self.loginUrl, loginCredentials)
            if loginData and len(loginData) > 0:
                loginData = self.regex.reduceNewLine(loginData)
                loginData = self.regex.reduceBlankSpace(loginData)
                print 'Login: '
                print loginData
        except Exception, x:
            print 'There was an error when login'
        return False
Author: rabbicse, Project: python-spider-bs4, Lines: 44, Source: PentaTransaction.py

Example 5: PaodeacucarScrapper

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import reduceNewLine [as alias]
class PaodeacucarScrapper(QThread):
    notifyPaode = pyqtSignal(object)

    def __init__(self):
        QThread.__init__(self)
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.mainUrl = 'http://www.paodeacucar.com.br/'
        self.url = 'http://www.paodeacucar.com.br/'
        dupCsvReader = Csv()
        self.dupCsvRows = dupCsvReader.readCsvRow('paodeacucar.csv', 4)
        self.csvWriter = Csv('paodeacucar.csv')
        csvDataHeader = ['SKU', 'Category', 'Subcategory', 'Name', 'URL', 'URL Image', 'Details',
                         'Nutrients Table html code', 'Price from, 28/abr/14', '28/abr/14']
        if 'URL' not in self.dupCsvRows:
            self.dupCsvRows.append(csvDataHeader)
            self.csvWriter.writeCsvRow(csvDataHeader)

    def run(self):
        self.scrapData()

    def scrapData(self):
        try:
            print 'Main URL: ', self.url
            self.notifyPaode.emit(('<font color=green><b>Main URL: %s</b></font>' % self.url))
            data = self.spider.fetchData(self.url)
            if data and len(data) > 0:
                data = self.regex.reduceNewLine(data)
                data = self.regex.reduceBlankSpace(data)
                soup = BeautifulSoup(data)
                categories = soup.find('nav', class_='items-wrapper').find_all('li', class_=re.compile('\s*item\s*'))
                print 'Total Categories: ', len(categories)
                self.notifyPaode.emit(('<font color=black><b>Total Categories: %s</b></font>' % str(len(categories))))
                for category in categories:
                    if category.a is not None:
                        submenu_target = self.regex.replaceData('#', '', category.a.get('data-target'))
                        sub_categories = soup.find('ul', id=submenu_target).find_all('li', class_='item')
                        print 'Total Sub Categories: ', len(sub_categories)
                        self.notifyPaode.emit(('<font color=black><b>Total Subcategories: %s</b></font>' % str(len(sub_categories))))
                        for sub_category in sub_categories:
                            sub_category_label = sub_category.find('span', class_='label').text
                            sub_category_url = sub_category.a.get('href') if sub_category.a is not None else 'N/A'
                            self.scrapItems(sub_category_url, category.text, sub_category_label)
        except Exception, x:
            self.logger.error(x.message)
            print x
Author: tuly, Project: Python-Paodeacucar-Spider, Lines: 50, Source: PaodeacucarScrapper.py

Example 6: CsBrands

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import reduceNewLine [as alias]
class CsBrands(QThread):
    notifyBrand = pyqtSignal(object)

    def __init__(self):
        QThread.__init__(self)
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        dupCsvReader = Csv()
        self.dupCsvRows = dupCsvReader.readCsvRow("cs_Brands.csv")
        self.csvWriter = Csv("cs_Brands.csv")
        self.mainUrl = "http://www.cs-catering-equipment.co.uk/brands"
        self.isExiting = False
        headerData = [
            "URL",
            "Parent Category",
            "Brand Category",
            "Brand Description",
            "Image File",
            "Product Codes in this category",
        ]
        if headerData not in self.dupCsvRows:
            self.csvWriter.writeCsvRow(headerData)

    def run(self):
        self.scrapBrands()
        self.notifyBrand.emit("<font color=red><b>Finished Scraping All Brands.</b></font>")

    def scrapBrands(self):
        self.notifyBrand.emit("<font color=green><b>Main URL: %s<b></font>" % self.mainUrl)
        self.notifyBrand.emit("<b>Try To scrap All Brands.<b>")
        data = self.spider.fetchData(self.mainUrl)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            brandChunks = self.regex.getAllSearchedData('(?i)<div class="man-group man-group-[a-z]">(.*?)</div>', data)
            if brandChunks and len(brandChunks) > 0:
                for brandChunk in brandChunks:
                    brands = self.regex.getAllSearchedData('(?i)<a href="([^"]*)"[^>]*?>([^<]*)</a>', brandChunk)
                    self.notifyBrand.emit("<b>Total Brands Found: %s<b>" % str(len(brands)))
                    if brands and len(brands) > 0:
                        for brand in brands:
                            try:
                                self.scrapBrandInfo(brand[0], "Shop By Brand", brand[1])
                            except Exception, x:
                                self.logger.error(x)
Author: rabbicse, Project: Python-Bertos-Scrapper, Lines: 49, Source: CsBrands.py

Example 7: CsTest

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import reduceNewLine [as alias]
class CsTest(QThread):
    notifyProduct = pyqtSignal(object)

    def __init__(self):
        QThread.__init__(self)
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        dupCsvReader = Csv()
        self.dupCsvRows0 = dupCsvReader.readCsvRow('cs_product.csv', 0)
        self.dupCsvRows = dupCsvReader.readCsvRow('cs_product.csv', 1)
        self.csvWriter = Csv('cs_product.csv')
        #        self.mainUrl = 'http://www.cs-catering-equipment.co.uk/'
        self.mainUrl = 'http://www.cs-catering-equipment.co.uk/brands'
        self.utils = Utils()
        if 'Product Code' not in self.dupCsvRows:
            self.csvWriter.writeCsvRow(
                ['URL', 'Product Code', 'Product Name', 'Manufacturer', 'List Price', 'Product Price', 'Discount',
                 'Product Short Description', 'Product Long Description', 'Product Technical Specifications',
                 'Warranty', 'Delivery', 'Product Image',
                 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Brand Image'])
        self.totalProducts = len(self.dupCsvRows)

    def run(self):
        self.scrapBrands()
        self.notifyProduct.emit('<font color=red><b>Finished Scraping All Brands.</b></font>')

    def scrapBrands(self):
        self.notifyProduct.emit('<font color=green><b>Main URL: %s<b></font>' % self.mainUrl)
        self.notifyProduct.emit('<b>Try To scrap All Brands.<b>')
        data = self.spider.fetchData(self.mainUrl)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            brandChunks = self.regex.getAllSearchedData('(?i)<div class="man-group man-group-[a-z]">(.*?)</div>', data)
            if brandChunks and len(brandChunks) > 0:
                for brandChunk in brandChunks:
                    brands = self.regex.getAllSearchedData('(?i)<a href="([^"]*)"[^>]*?>([^<]*)</a>', brandChunk)
                    self.notifyProduct.emit('<b>Total Brands Found: %s<b>' % str(len(brands)))
                    if brands and len(brands) > 0:
                        for brand in brands:
                            try:
                                self.scrapBrandInfo(brand[0], 'Shop By Brand', brand[1])
                            except Exception, x:
                                self.logger.error(x)
Author: rabbicse, Project: Python-CsCatering-scrapper, Lines: 49, Source: CsTest.py

Example 8: AmazonScrapper

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import reduceNewLine [as alias]
class AmazonScrapper():
    def __init__(self, url):
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.url = url
        self.base_product_url = 'http://www.amazon.com/dp/'
        self.base_image_url = 'http://ecx.images-amazon.com/images/I/'
        self.csvWriter = Csv('amazon.csv')
        csvDataHeader = ['URL', 'HTML Path', 'Image URLS']
        self.csvWriter.writeCsvRow(csvDataHeader)

    def scrapData(self):
        try:
            host = ('Host', 'www.amazon.com')
            data = self.spider.fetchData(self.url, host=host)
            if data:
                data = self.regex.reduceNewLine(data)
                data = self.regex.reduceBlankSpace(data)
                searchParams = self.regex.getSearchedData('(?i)var searchParams = {([^\}]*)}', data)
                searchParams = searchParams.split(',')
                seller = ''
                marketPlaceId = ''
                useMYI = ''
                for searchParam in searchParams:
                    searchParam = self.regex.reduceBlankSpace(searchParam)
                    searchParam = self.regex.replaceData('\'', '', searchParam)

                    if searchParam.startswith('seller'):
                        seller = searchParam.split(':')[1].strip()
                        seller = seller.decode('string-escape')
                    if searchParam.startswith('marketplaceID'):
                        marketPlaceId = searchParam.split(':')[1].strip()
                        marketPlaceId = marketPlaceId.decode('string-escape')
                    if searchParam.startswith('useMYI'):
                        useMYI = searchParam.split(':')[1].strip()
                        useMYI = useMYI.decode('string-escape')
                params = {'seller': seller,
                          'marketPlaceId': marketPlaceId,
                          'useMYI': useMYI}
                ajax_url = 'http://www.amazon.com/gp/aag/ajax/productWidget.html'
                self.scrapAjaxPage(ajax_url, params, host)
        except Exception, x:
            print x
Author: rabbicse, Project: python-spider-bs4, Lines: 47, Source: AmazonScrapper.py

Example 9: WalgreensScrapper

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import reduceNewLine [as alias]
class WalgreensScrapper():
    def __init__(self):
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.main_url = 'http://www.walgreens.com'
        self.url = 'http://www.walgreens.com/store/catalog/shopLanding'
        self.sitemap_xml = 'http://www.walgreens.com/sitemap.xml'
        dupCsvReader = Csv()
        self.dupCsvRows = dupCsvReader.readCsvRow('walgreens.csv')
        self.csvWriter = Csv('walgreens.csv')
        csvDataHeader = ['Product Name', 'Price', 'Description', 'Shipping', 'Ingredients', 'Image']
        if csvDataHeader not in self.dupCsvRows:
            self.csvWriter.writeCsvRow(csvDataHeader)


    def scrapData(self):
        try:
            print 'First scrapping sitemap...'
            self.scrapSiteMap()

            print 'Main URL: ' + self.url
            data = self.spider.fetchData(self.url)
            if data and len(data) > 0:
                data = self.regex.reduceNewLine(data)
                data = self.regex.reduceBlankSpace(data)
                soup = BeautifulSoup(data)
                categoryBar = soup.find('div', class_='wid150 padrt5px padlt5px float-left')
                if categoryBar:
                    categories = categoryBar.find_all('li')
                    for category in categories:
                        category_url = self.main_url + category.a.get('href')
                        self.scrapCategory(category_url)
        except Exception, x:
            print x
Author: rabbicse, Project: python-spider-bs4, Lines: 38, Source: WalgreensScrapper.py

Example 10: WpScrapper

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import reduceNewLine [as alias]
class WpScrapper():
    def __init__(self, input_file, output_file):
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.input_file = input_file
        self.output_file = output_file

    def scrapData(self):
        csv_writer = csv.writer(open(self.output_file, 'wb'), delimiter=';')
        with open(self.input_file, 'rb') as csvfile:
            csv_rows = csv.reader(csvfile, delimiter=';')
            rows = list(csv_rows)
            total = len(rows)
            counter = 0
            for row in rows:
                counter += 1
                print '---------------- Checking [%d] of [%d] records. ----------------------' % (counter, total)
                self.logger.debug('Checking %d of %d records.' % (counter, total))
                domain = 'http://' + row[0] + '/wp-login.php'
                https_domain = 'https://' + row[0] + '/wp-login.php'
                wp_admin = 'http://' + row[0] + '/wp-admin/'
                https_wp_admin = 'https://' + row[0] + '/wp-admin/'
                username = row[1]
                password = row[2]
                status = 0
                print 'Login Credential => Domain: ' + domain + ' User: ' + username + ' Password: ' + password
                self.logger.debug('Login Credential => Domain: ' + domain + ' User: ' + username + ' Password: ' + password)
                if self.onLogin(domain, https_domain, wp_admin, https_wp_admin, username, password) is True:
                    print 'Successfully logged in.'
                    self.logger.debug('Successfully logged in.')
                    status = 1
                else:
                    print 'Login failed!'
                    self.logger.debug('Login failed!')
                csv_writer.writerow([row[0], username, password, status])
                print '---------------- End of checking [%d] of [%d] records. ----------------------' % (counter, total)
                print '\n\n'


    def onLogin(self, url, https_url, wp_url, https_wp_url, username, password):
        '''
        Credentials are:
        action	login_access
        i
        p
        password	sdfsdf
        username	sdfsdf
        '''
        try:
            loginCredentials = {'log': username,
                                'pwd': password,
                                'redirect_to': wp_url}
            print 'Credentials', loginCredentials
            print 'Please wait...Try to login with your credentials.'
            redirected_url, loginData = self.spider.login(url, loginCredentials)
            print 'redirected url: ', redirected_url
            if loginData and len(loginData) > 0:
                loginData = self.regex.reduceNewLine(loginData)
                loginData = self.regex.reduceBlankSpace(loginData)
                print 'After login data: ', loginData
            if redirected_url is not None and redirected_url.strip() == wp_url.strip(): return True

            # if loginData and len(loginData) > 0:
            #     loginData = self.regex.reduceNewLine(loginData)
            #     loginData = self.regex.reduceBlankSpace(loginData)
            # soup = BeautifulSoup(loginData)
            # if soup.find('div', {'id': 'login_error'}):
            #     return False
            # else:
            #     return True
        except Exception, x:
            print x
            print 'There was an error when login with http'

        try:
            https_loginCredentials = {'log': username,
                                      'pwd': password,
                                      'redirect_to': https_wp_url}
            print 'Credentials', https_loginCredentials
            print 'Please wait...Try to login with your credentials.'
            https_redirected_url, https_login_data = self.spider.login(https_url, https_loginCredentials)
            if https_redirected_url is not None and https_redirected_url.strip() == https_wp_url.strip(): return True
        except Exception, x:
            print x
            print 'There was an error when login with https'
Author: HughP, Project: Python-WP-Login, Lines: 89, Source: WpScrapper.py

Example 11: MyLinkedIn

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import reduceNewLine [as alias]
class MyLinkedIn(QThread):
    notifyLinkedIn = pyqtSignal(object)
    notifyMember = pyqtSignal(object)
    cookieL = pyqtSignal(object)

    def __init__(self, username, password):
        QThread.__init__(self)
        self.spider = Spider()
        self.regex = Regex()
        self.username = username
        self.password = password

    def run(self):
        if self.login():
            self.getAllGroups()

    def login(self):
        print "login start"
        self.notifyLinkedIn.emit("<b>Trying to login. Please wait...</b>")
        loginPageData = self.spider.fetchData("https://www.linkedin.com/uas/login?goback=&trk=hb_signin")
        loginPageData = self.regex.reduceNewLine(loginPageData)
        loginPageData = self.regex.reduceBlankSpace(loginPageData)

        ## <input type="hidden" name="session_redirect" value="" id="session_redirect-login"><input type="hidden" name="csrfToken" value="ajax:9073845200579364133" id="csrfToken-login"><input type="hidden" name="sourceAlias" value="0_7r5yezRXCiA_H0CRD8sf6DhOjTKUNps5xGTqeX8EEoi" id="sourceAlias-login">
        self.sessionRedirect = self.regex.getSearchedData(
            '(?i)<input type="hidden" name="session_redirect" value="([^"]*)"', loginPageData
        )
        self.token = self.regex.getSearchedData(
            '(?i)<input type="hidden" name="csrfToken" value="([^"]*)"', loginPageData
        )
        self.alias = self.regex.getSearchedData(
            '(?i)<input type="hidden" name="sourceAlias" value="([^"]*)"', loginPageData
        )

        loginParam = {
            "csrfToken": self.token,
            "isJsEnabled": "true",
            "session_key": self.username,
            "session_password": self.password,
            #                      'session_key': '[email protected]',
            #                      'session_password': 'ubuntu36',
            "session_redirect": self.sessionRedirect,
            "signin": "Sign In",
            "sourceAlias": self.alias,
            "source_app": "",
        }
        print loginParam
        print "start login"
        time.sleep(5)
        loginData = self.spider.login("https://www.linkedin.com/uas/login-submit", loginParam)
        loginData = self.regex.reduceNewLine(loginData)
        loginData = self.regex.reduceBlankSpace(loginData)
        #        print loginData
        isLoggedIn = self.regex.isFoundPattern('(?i)<li class="signout">', loginData)

        if isLoggedIn:
            self.notifyLinkedIn.emit("<font color=green><b>Successfully Logged In.</b></font>")
            print "login success"
            self.cookieL.emit(self.spider)
            return True
        else:
            self.notifyLinkedIn.emit(
                "<font color=red><b>Something wrong with logging in. Please try again or check manually with this username/password</b></font>"
            )
            return False

    def getAllGroups(self):
        print "start groups"
        self.notifyLinkedIn.emit("<font color=green><b>Start Scraping All Groups.</b></font>")
        self.notifyLinkedIn.emit("<b>Wait for 15 second break...</b>")
        time.sleep(15)
        self.notifyLinkedIn.emit("<b>15 second break finish!!!</b>")
        self.notifyLinkedIn.emit("<font color=green><b>Fetching data for scraping your groups.</b></font>")
        groupsUrl = "http://www.linkedin.com/myGroups?trk=hb_side_grps_top"
        groupsData = self.spider.fetchData(groupsUrl)
        self.notifyLinkedIn.emit("<font color=green><b>Data fetching complete for scraping your groups.</b></font>")
        if groupsData is not None and len(groupsData) > 0:
            print "starting groups"
            groupsData = self.regex.reduceNewLine(groupsData)
            groupsData = self.regex.reduceBlankSpace(groupsData)
            print groupsData
            ## <a href="/groups?gid=72881&amp;trk=myg_ugrp_ovr" class="private" title="This group is members only">MySQL Professionals</a>
            groupInfo = self.regex.getAllSearchedData('(?i)<a href="(/groups\?gid=[^"]*)"[^>]*>([^<]*)</a>', groupsData)
            if groupInfo is not None and len(groupInfo) > 0:
                members = []
                for group in groupInfo:
                    groupUrl = "http://www.linkedin.com" + str(group[0])
                    groupName = str(group[1])
                    self.notifyLinkedIn.emit("<b>Group Name: </b>%s <b>URL: </b>%s" % (groupName, groupUrl))
                    #                    http://www.linkedin.com/groups?members=&gid=65688&trk=anet_ug_memb
                    gid = self.regex.getSearchedData("(?i)gid=(\d+)", group[0])
                    print gid
                    groupUrl = "http://www.linkedin.com/groups?members=&gid=" + gid + "&trk=anet_ug_memb"
                    members.append((groupName, groupUrl))
                self.notifyMember.emit(members)
            self.notifyLinkedIn.emit("<font color=red><b>Finish Scraping All Groups.</b></font>")
Author: rabbicse, Project: Python-LinkedIn-Scrapper, Lines: 98, Source: MyLinkedIn.py

Example 12: YoutubeScrapper

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import reduceNewLine [as alias]
class YoutubeScrapper(object):
    def __init__(self):
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()


    def scrapVideoDownloadUrl(self, url, filename=None):
        data = self.spider.fetchData(url)
        if data and len(data) > 0:
            title = self.scrapTitle(url)
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            dlUrlChunk = self.regex.getSearchedData('(?i)"url_encoded_fmt_stream_map": "([^"]*)"', data)
            dlUrlChunk = self.regex.replaceData('(?i)\\\\u0026', ' ', dlUrlChunk)
            dlUrlParts = dlUrlChunk.split(',')
            sig = ''
            video = ''
            videoUrl = ''
            print dlUrlParts
            for dlUrlPart in dlUrlParts:
                dlUrlPart = urllib2.unquote(dlUrlPart)
                print dlUrlPart

                ## TODO
                if self.regex.isFoundPattern('(?i)itag=22', dlUrlPart) or self.regex.isFoundPattern('(?i)itag=18',
                                                                                                    dlUrlPart):
                    urlPart = dlUrlPart.split(' ')
                    for part in urlPart:
                        print part
                        if self.regex.isFoundPattern('(?i)sig=.*?', part):
                            sig = self.regex.getSearchedData('(?i)sig=(.*?)$', part)

                        if self.regex.isFoundPattern('(?i)url=.*?', part):
                            video = self.regex.getSearchedData('(?i)url=(.*?)$', part)
                            print video

                    videoUrl = video + '&signature=' + sig
                    self.downloadDir = './natok.mp4'

                    print 'Video URL= ' + videoUrl
                    print self.downloadDir
                    break

            # dlPath = './natok.mp4' if filename is None else filename
            fname = self.regex.replaceData('\s+', '_', title)
            dlPath = './' + fname  + '.mp4' if filename is None else filename
            print dlPath
            print '\n\n'
            if self.downloadFile(videoUrl, dlPath) is True:
                print 'Download complete'
        else:
            print 'No data found.'

    def scrapTitle(self, url):
        #        https://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=9bZkp7q19f0&format=xml
        xmlUrl = 'https://www.youtube.com/oembed?url=' + str(url) + '&format=xml'
        data = self.spider.fetchData(xmlUrl)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            print data
            return self.regex.getSearchedData('(?i)<title>([^<]*)</title>', data)

    def downloadFile(self, url, downloadPath, retry=0):
        try:
            opener = urllib2.build_opener(urllib2.HTTPRedirectHandler(),
                                          urllib2.HTTPHandler(debuglevel=0),
                                          urllib2.HTTPSHandler(debuglevel=0))
            opener.addheaders = [
                ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1'),
                ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
                ('Connection', 'keep-alive')]
            #            resp = opener.open(url, timeout=10)
            resp = urllib2.urlopen(url, timeout=60)
            print 'ok'

            print resp.info()
            contentLength = resp.info()['Content-Length']
            contentLength = self.regex.getSearchedData('(?i)^(\d+)', contentLength)
            totalSize = float(contentLength)
            directory = os.path.dirname(downloadPath)
            if not os.path.exists(directory):
                os.makedirs(directory)
            currentSize = 0
            dl_file = open(downloadPath, 'ab')
            try:
                if os.path.getsize(downloadPath):
                    start = os.path.getsize(downloadPath)
                    currentSize = start
                    opener.addheaders.append(('Range', 'bytes=%s-' % (start)))
            except Exception, x:
                print x

            res = opener.open(url, timeout=60)
            CHUNK_SIZE = 256 * 1024
            while True:
                data = res.read(CHUNK_SIZE)
                # data = resp.read(CHUNK_SIZE)
#.........part of the code omitted here.........
Author: rabbicse, Project: Python-YTDownloader, Lines: 103, Source: YtDownloader.py

Example 13: EbagsScrapper

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import reduceNewLine [as alias]
class EbagsScrapper():
    def __init__(self):
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.mainUrl = 'http://www.ebags.com'
        self.url = 'http://www.ebags.com/brands'

    def scrapData(self):
        data = self.spider.fetchData(self.url)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            allBrandLinks = []
            soup = BeautifulSoup(data)
            brandChunks = soup.find_all('div', class_='brandList4Col')
            if brandChunks and len(brandChunks) > 0:
                for brandChunk in brandChunks:
                    brandLinks = brandChunk.find_all('a')
                    if brandLinks and len(brandLinks) > 0:
                        for brandLink in brandLinks:
                            brandUrl = brandLink.get('href')
                            allBrandLinks.append(self.mainUrl + brandUrl)

            print 'Total brands found: ', str(len(allBrandLinks))
            for brandUrl in allBrandLinks:
                self.scrapBrandData(brandUrl)

    def scrapBrandData(self, url, page=1):
        brandUrl = url if page == 1 else url + '?page=' + str(page)
        print 'Brand URL: ', brandUrl
        data = self.spider.fetchData(brandUrl)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            soup = BeautifulSoup(data)
            productLinks = soup.find_all('div', class_='itemProductName')
            if productLinks and len(productLinks) > 0:
                for productLink in productLinks:
                    productUrl = productLink.find('a').get('href')
                    productDetailUrl = self.mainUrl + productUrl
                    print 'Product detail url: ', productDetailUrl
                    fileName = self.regex.getSearchedData('(?i)^.*?(\d+)$', productUrl) + '.html'
                    print 'File Name: ', fileName
                    self.scrapProductDetail(self.mainUrl + productUrl, fileName)

            ## Check if any next page
            if soup.find('a', class_='pagingNext') is not None:
                nextLink = soup.find('a', class_='pagingNext').get('href')
                print nextLink
                if nextLink is not None and len(nextLink) > 0:
                    print 'Pagination found...'
                    return self.scrapBrandData(url, page + 1)

    def scrapProductDetail(self, url, filename):
        data = self.spider.fetchData(url)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            directory = os.path.dirname('./products/' + filename)
            if not os.path.exists(directory):
                os.makedirs(directory)
            print 'File saving path: ', os.path.abspath('./products/' + filename)
            dl_file = open('./products/' + filename, 'wb')
            dl_file.write(data)
            dl_file.close()
Author: rabbicse, Project: python-spider-bs4, Lines: 69, Source: EbagsScrapper.py

Example 14: CsProduct

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import reduceNewLine [as alias]
class CsProduct(QThread):
    notifyProduct = pyqtSignal(object)

    def __init__(self):
        QThread.__init__(self)
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        dupCsvReader = Csv()
        self.dupCsvRows0 = dupCsvReader.readCsvRow('cs_product.csv', 0)
        self.dupCsvRows = dupCsvReader.readCsvRow('cs_product.csv', 1)
        self.csvWriter = Csv('cs_product.csv')
        self.mainUrl = 'http://www.cs-catering-equipment.co.uk/'
        self.utils = Utils()
        if 'Product Code' not in self.dupCsvRows:
            self.csvWriter.writeCsvRow(
                ['URL', 'Product Code', 'Product Name', 'Manufacturer', 'List Price', 'Product Price', 'Discount',
                 'Product Short Description', 'Product Long Description', 'Product Technical Specifications',
                 'Warranty', 'Delivery', 'Product Image',
                 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Brand Image'])
        self.totalProducts = len(self.dupCsvRows)

    def run(self):
        self.scrapProduct()
        self.notifyProduct.emit('<font color=red><b>Finished Scraping All products.</b></font>')

    def scrapProduct(self):
    #        self.logger.debug('Main URL: ' + self.mainUrl)
        self.notifyProduct.emit('<font color=green><b>Main URL: %s</b></font>' % self.mainUrl)
        data = self.spider.fetchData(self.mainUrl)
        data = self.regex.reduceNewLine(data)
        data = self.regex.reduceBlankSpace(data)
        self.notifyProduct.emit('<b>Try to scrap all categories.</b>')
        categories = self.regex.getAllSearchedData('(?i)<a href="([^"]*)" class="level-top" title="([^"]*)"', data)
        if categories and len(categories) > 0:
            self.notifyProduct.emit('<b>Total Categories Found: %s</b>' % str(len(categories)))
            for category in categories:
                category1Name = unicode(category[1]).strip()
                self.scrapCategory1Data(str(category[0]).strip(), category1Name)

    def scrapCategory1Data(self, url, category1Name):
    #        self.logger.debug('Category 1 URL: ' + url)
        self.notifyProduct.emit('<b>Try to scrap all categories under Category[%s]</b>' % category1Name)
        self.notifyProduct.emit('<font color=green><b>Category URL: %s</b></font>' % url)
        data = self.spider.fetchData(url)
        data = self.regex.reduceNewLine(data)
        data = self.regex.reduceBlankSpace(data)
        categories = self.regex.getAllSearchedData(
            '(?i)<li> <a href="([^"]*)" title="([^"]*)"[^>]*?>[^<]*?</a> </li>', data)
        if categories and len(categories) > 0:
            self.notifyProduct.emit('<b>Total Categories Found: %s</b>' % str(len(categories)))
            for category in categories:
                self.scrapCategory2Data(category[0], category1Name, category[1])

    def scrapCategory2Data(self, url, category1Name, category2Name):
    #        self.logger.debug('Category 2 URL: ' + url)
        self.notifyProduct.emit('<b>Try to scrap all categories under Category[%s]</b>' % category2Name)
        self.notifyProduct.emit('<font color=green><b>Category URL: %s</b></font>' % url)
        data = self.spider.fetchData(url)
        data = self.regex.reduceNewLine(data)
        data = self.regex.reduceBlankSpace(data)
        categories = self.regex.getAllSearchedData(
            '(?i)<li> <a href="([^"]*)" title="([^"]*)"[^>]*?>[^<]*?</a> </li>', data)
        if categories and len(categories) > 0:
            for category in categories:
                print 'category2: ' + category[0]
                self.scrapCategory3Data(category[0], category1Name, category2Name, category[1])

    def scrapCategory3Data(self, url, category1Name, category2Name, category3Name):
    #        self.logger.debug('Category 3 URL: ' + url)
        self.notifyProduct.emit('<b>Try to scrap all categories under Category[%s]</b>' % category3Name)
        self.notifyProduct.emit('<font color=green><b>Category URL: %s</b></font>' % url)
        data = self.spider.fetchData(url)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            categories = self.regex.getAllSearchedData(
                '(?i)<li> <a href="([^"]*)" title="([^"]*)"[^>]*?>[^<]*?</a> </li>', data)
            if categories and len(categories) > 0:
                for category in categories:
                    print [category1Name, category2Name, category3Name, category[1]]
                    self.scrapProductsDetails(category[0], category1Name, category2Name, category3Name, category[1])

    def scrapProductsDetails(self, url, category1Name, category2Name, category3Name, category4Name):
        self.logger.debug('Product Details URL: ' + url)
        self.notifyProduct.emit('<b>Try to scrap all products under Category[%s]</b>' % category4Name)
        self.notifyProduct.emit('<font color=green><b>Category URL: %s</b></font>' % url)
        maxLimit = 25
        maxLimitChunk = self.spider.fetchData(url + '?mode=list')
        if maxLimitChunk and len(maxLimitChunk):
            maxLimitChunk = self.regex.reduceNewLine(maxLimitChunk)
            maxLimitChunk = self.regex.reduceBlankSpace(maxLimitChunk)
            maxLimits = self.regex.getAllSearchedData('<option value="[^"]*limit=(\d+)[^"]*"', maxLimitChunk)
            #            print maxLimits
            if maxLimits and len(maxLimits) > 0:
                maxLimit = max(map(int, maxLimits))
                #                print maxLimit

#.........part of the code omitted here.........
Author: rabbicse, Project: Python-CsCatering-scrapper, Lines: 103, Source: CsProduct.py

Example 15: MyLinkedInMessage

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import reduceNewLine [as alias]
class MyLinkedInMessage(QThread):
    notifyLinkedIn = pyqtSignal(object)

    def __init__(self, spider, memberList, subject, message):
        QThread.__init__(self)
        #        self.spider = Spider()
        self.spider = spider
        self.regex = Regex()
        self.memberList = memberList
        self.subject = unicode(subject)
        self.message = unicode(message)

    def run(self):
        self.sendMessage()
        self.notifyLinkedIn.emit('<font color=red><b>Finish Sending All Messages.</b></font>')

    def sendMessage(self):
        print self.memberList
        for member in self.memberList:
            messageUrl = 'http://www.linkedin.com/inbox/compose/dialog?insider=true&connId=' + str(member[1])
            print messageUrl
#            messageUrl = 'http://www.linkedin.com/inbox/compose/dialog?insider=true&connId=' + '65471931'

#        data = self.spider.fetchData('http://www.linkedin.com/inbox/compose/dialog?insider=true&connId=65471931')
            data = self.spider.fetchData(messageUrl)
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            fromName = self.regex.getSearchedData('(?i)<input type="hidden" name="fromName" value="([^"]*)"', data)
            fromEmail = self.regex.getSearchedData('(?i)<input type="hidden" name="fromEmail" value="([^"]*)"', data)
            #        connectionIds = self.regex.getSearchedData('(?i)<input type="hidden" name="connectionIds" value="([^"]*)"', data)
            csrfToken = self.regex.getSearchedData('(?i)<input type="hidden" name="csrfToken" value="([^"]*)"', data)
            sourceAlias = self.regex.getSearchedData('(?i)<input type="hidden" name="sourceAlias" value="([^"]*)"', data)

            linkedInSubject = u'Hi ' + unicode(member[0]).split(' ')[0] + self.subject
            linkedInMessage = u'Hi ' + unicode(member[0]).split(' ')[0] + u',\n' + self.message
            print linkedInMessage
            params = {'addMoreRcpts': 'false',
                      'ajaxSubmit': 'Send Message',
                      'allowEditRcpts': 'true',
                      'body': linkedInMessage,
                      'connectionIds': str(member[1]),
                      'connectionNames': '',
                      'csrfToken': csrfToken,
                      'fromEmail': fromEmail,
                      'fromName': fromName,
                      'itemID': '',
                      'openSocialAppBodySuffix': '',
                      'showRecipeints': 'showRecipeints',
                      'sourceAlias': sourceAlias,
                      'st': '',
                      'subject': linkedInSubject,
                      'submit': 'Send Message',
                      'viewerDestinationUrl': ''}
            print params

            msgUrl = 'http://www.linkedin.com/msgToConns?displayCreate='
            data = self.spider.fetchData(msgUrl, params)
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            if self.regex.isFoundPattern('(?i)<div class="alert success">', data):
                print 'Message Sent.'
                self.notifyLinkedIn.emit('<font color=green><b>Successfully Sent Message To: %s</b></font>' % member[0])
            else:
                self.notifyLinkedIn.emit('<font color=red><b>Something Wrong during Send Message To: %s</b></font>' % member[0])

        #        params = {'addMoreRcpts': 'false',
        #                  'ajaxSubmit': 'Send Message',
        #                  'allowEditRcpts': 'true',
        #                  'body': 'Script Test',
        #                  'connectionIds': '65471931',
        #                  'connectionNames': '',
        #                  'csrfToken': 'ajax: 6539671039643459056',
        #                  'fromEmail': '467728216',
        #                  'fromName': 'Mehedi Hasan',
        #                  'itemID': '',
        #                  'openSocialAppBodySuffix': '',
        #                  'showRecipeints': 'showRecipeints',
        #                  'sourceAlias': '0_6k2algZhQ6vbvlhlVSByxRKi0OB9NXjxrnJYWBFvfhn',
        #                  'st': '',
        #                  'subject': 'Script Test',
        #                  'submit': 'Send Message',
        #                  'viewerDestinationUrl': ''}
        #<input type="hidden" name="fromName" value="Mehedi Hasan" id="fromName-msgForm">
        # <input type="hidden" name="showRecipeints" value="showRecipeints" id="showRecipeints-msgForm">
        # <input type="hidden" name="fromEmail" value="467728216" id="fromEmail-msgForm">
        # <input type="hidden" name="connectionIds" value="65471931" id="connectionIds-msgForm">
        # <input type="hidden" name="connectionNames" value="" id="connectionNames-msgForm">
        # <input type="hidden" name="allowEditRcpts" value="true" id="allowEditRcpts-msgForm">
        # <input type="hidden" name="addMoreRcpts" value="false" id="addMoreRcpts-msgForm">
        # <input type="hidden" name="itemID" value="" id="itemID-msgForm">
        # <input type="hidden" name="openSocialAppBodySuffix" value="" id="openSocialAppBodySuffix-msgForm">
        # <input type="hidden" name="st" value="" id="st-msgForm">
        # <input type="hidden" name="viewerDestinationUrl" value="" id="viewerDestinationUrl-msgForm">
        # <input type="hidden" name="csrfToken" value="ajax:6539671039643459056" id="csrfToken-msgForm">
        # <input type="hidden" name="sourceAlias" value="0_6k2algZhQ6vbvlhlVSByxRKi0OB9NXjxrnJYWBFvfhn" id="sourceAlias-msgForm">

        """
        msgUrl1 = 'http://www.linkedin.com/msgToConns?displayCreate='
        msgParams = {}
        addMoreRcpts	false
#.........part of the code omitted here.........
Author: rabbicse, Project: Python-LinkedIn-Scrapper, Lines: 103, Source: MyLinkedInMessage.py


Note: the utils.Regex.Regex.reduceNewLine method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are excerpted from open-source projects contributed by their respective authors, and copyright remains with the original authors; consult each project's License before redistributing or reusing the code. Do not reproduce without permission.