Python Regex.replaceData Method Code Examples

This article collects typical usage examples of the Python method utils.Regex.Regex.replaceData. If you are wondering what Regex.replaceData does or how to call it, the curated code examples below should help. You can also explore other usage examples of the containing class, utils.Regex.Regex.


The following shows 15 code examples of Regex.replaceData, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the system recommend better Python code samples.
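
Before diving into the examples, here is a minimal, hypothetical sketch of what the utils.Regex.Regex wrapper might look like, inferred only from how the examples below call it. The method names and behaviors shown (replaceData as a thin wrapper over re.sub, plus reduceNewLine, reduceBlankSpace, getSearchedData, and isFoundPattern) are assumptions, not the project's actual implementation.

# Hypothetical sketch of the Regex wrapper, inferred from usage in the examples below.
# The real utils.Regex.Regex may differ; treat this as an illustration only.
import re

class Regex(object):
    def replaceData(self, pattern, replacement, data):
        # Presumably a thin wrapper around re.sub(pattern, replacement, data)
        return re.sub(pattern, replacement, data)

    def reduceNewLine(self, data):
        # Collapse newlines into single spaces (assumed behavior)
        return re.sub(r'\n+', ' ', data)

    def reduceBlankSpace(self, data):
        # Collapse runs of whitespace into a single space (assumed behavior)
        return re.sub(r'\s+', ' ', data)

    def getSearchedData(self, pattern, data):
        # Return the first captured group, or an empty string (assumed behavior)
        match = re.search(pattern, data)
        return match.group(1) if match else ''

    def isFoundPattern(self, pattern, data):
        # True if the pattern occurs anywhere in the data (assumed behavior)
        return re.search(pattern, data) is not None

if __name__ == '__main__':
    regex = Regex()
    # Normalize a raw HTML fragment, as the scrappers below do with replaceData
    html = 'line one\r\n  line   two'
    html = regex.replaceData(r'\r+', '', html)
    html = regex.replaceData(r'\s+', ' ', html)
    print(html)  # -> 'line one line two'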

Example 1: SaraivaScrapper

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]
class SaraivaScrapper(QThread):
    notifySaraiva = pyqtSignal(object)

    def __init__(self, urlList, category, htmlTag, replaceTag):
        QThread.__init__(self)
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.urlList = urlList
        self.category = category
        self.htmlTag = self.regex.replaceData('\r+', '', htmlTag)
        self.htmlTag = self.regex.replaceData('\n+', ' ', self.htmlTag)
        self.htmlTag = self.regex.replaceData('\s+', ' ', self.htmlTag)
        self.htmlTag = self.regex.replaceData(r'\"+', '\"', self.htmlTag)
        self.replaceTag = replaceTag
        self.csvWriter = Csv(category + '.csv')
        csvDataHeader = ['Link', 'Name', 'Subtitle', 'Price', 'Synopsis and Characteristics', 'Picture']
        self.csvWriter.writeCsvRow(csvDataHeader)
        self.mainUrl = 'http://busca.livrariasaraiva.com.br'
        self.scrapUrl = None
        self.dbHelper = DbHelper('saraiva.db')
        self.dbHelper.createTable(category)
        self.total = self.dbHelper.getTotalProduct(category)

    def run(self, retry=0):
        try:
            if self.urlList is not None and len(self.urlList):
                for url in self.urlList:
                    if len(url) > 0:
                        url = self.regex.replaceData('(?i)\r', '', url)
                        url = self.regex.replaceData('(?i)\n', '', url)
                        self.notifySaraiva.emit('<font color=green><b>Saraiva Main URL: %s</b></font>' % url)
                        paginationUrl, self.maxRecords = self.reformatUrl(url)
                        self.notifySaraiva.emit(
                            '<font color=black><b>Total Records: %s</b></font>' % str(self.maxRecords))
                        print 'Max records: ', self.maxRecords
                        print 'URL: ' + str(paginationUrl)
                        sortList = ['&isort=globalpop', '&isort=best', '&isort=title', '&isort=title+rev',
                                    '&isort=price+rev',
                                    '&isort=price', '&isort=date+rev']
                        for sort in sortList:
                            self.scrapResults(paginationUrl, sort)
            self.notifySaraiva.emit('<font color=red><b>Saraiva Data Scraping finished.</b></font>')
        except Exception, x:
            print x.message
            self.logger.error('Exception at run: ', x.message)
            if retry < 5:
                self.run(retry + 1)
Developer: rabbicse, Project: Python-Saraiva-Scrapper, Lines of code: 51, Source file: SaraivaScrapper.py
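
The chained replaceData calls in the constructor above normalize the user-supplied HTML tag: carriage returns are stripped, newlines and runs of whitespace are folded into single spaces, and repeated double quotes are collapsed. Assuming replaceData is equivalent to re.sub, the same normalization can be written directly with the standard library as a quick sanity check:

import re

def normalize_html_tag(html_tag):
    # Mirrors the normalization chain in SaraivaScrapper.__init__ (assuming re.sub equivalence)
    html_tag = re.sub(r'\r+', '', html_tag)    # drop carriage returns
    html_tag = re.sub(r'\n+', ' ', html_tag)   # fold newlines into spaces
    html_tag = re.sub(r'\s+', ' ', html_tag)   # collapse runs of whitespace
    html_tag = re.sub(r'\"+', '"', html_tag)   # collapse repeated double quotes
    return html_tag

print(normalize_html_tag('<div   class="x">\r\n  some   text </div>'))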

Example 2: __init__

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]
class GoogleFinanceScrapper:
    isFinished = False

    def __init__(self, filename):
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.filename = filename
        self.url = 'https://www.google.com/finance?'
        self.main_url = 'https://www.google.com'
        self.csvWriter = Csv('google_finance.csv')
        csvDataHeader = ['Ticker Symbol', 'Quarter End', 'Revenue', 'Total Revenue', 'Date of Scrape']
        self.csvWriter.writeCsvRow(csvDataHeader)

    def run(self):
        self.scrapData()
        self.csvWriter.closeWriter()

    def scrapData(self):
        try:
            file = open(self.filename, 'rb')
            for line in file.readlines():
                if self.isFinished: return
                line = self.regex.replaceData('\r+', '', line)
                line = self.regex.reduceNewLine(line)
                line = self.regex.reduceBlankSpace(line)
                line = line.strip()
                params = urllib.urlencode({'q': line})
                url = self.url + params
                self.scrapBykeyword(url, line)
        except Exception, x:
            print x
            self.logger.error('Error: ' + x.message)
Developer: tuly, Project: Python-Internet-Harvesting, Lines of code: 36, Source file: GoogleFinanceScrapper.py

Example 3: __init__

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]
class TopsyScrapper:
    isFinished = False

    def __init__(self, filename):
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.filename = filename
        self.url = 'http://topsy.com/s?'
        self.csvWriter = Csv('topsy.csv')
        csvDataHeader = ['Keyword', 'Tweets in last 30 days', 'Topsy Sentiment Score', ' Date of scrape']
        self.csvWriter.writeCsvRow(csvDataHeader)

    def run(self):
        self.scrapData()
        self.csvWriter.closeWriter()

    def scrapData(self):
        try:
            file = open(self.filename, 'rb')
            for line in file.readlines():
                if self.isFinished: return
                line = self.regex.replaceData('\r+', '', line)
                line = self.regex.reduceNewLine(line)
                line = self.regex.reduceBlankSpace(line)
                line = line.strip()
                if len(line) > 0:
                    params = urllib.urlencode({'q': line, 'window': 'm', 'type': 'tweet'})
                    url = self.url + params
                    self.scrapBrowserData(url, line)
        except Exception, x:
            print x
Developer: tuly, Project: Python-Internet-Harvesting, Lines of code: 35, Source file: TopsyScrapper.py

Example 4: AmazonScrapper

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]
class AmazonScrapper(QThread):
    notifyAmazon = pyqtSignal(object)

    def __init__(self, urlList, category):
        QThread.__init__(self)
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.urlList = urlList
        self.category = category
        dupCsvReader = Csv()
        self.dupCsvRows = dupCsvReader.readCsvRow(category + '.csv')
        self.csvWriter = Csv(category + '.csv')
        csvDataHeader = ['SKU', 'Title', 'Sub Title', 'Price', 'Shipping Weight', 'Image URL']
        if csvDataHeader not in self.dupCsvRows:
            self.dupCsvRows.append(csvDataHeader)
            self.csvWriter.writeCsvRow(csvDataHeader)
        self.mainUrl = 'http://www.amazon.com'
        self.scrapUrl = None
        self.dbHelper = DbHelper('amazon.db')
        self.dbHelper.createTable(category)
        self.total = self.dbHelper.getTotalProduct(category)

    def run(self, retry=0):
        try:
            # self.scrapProductDetail(
            #     'http://www.amazon.com/Casio-MRW-S300H-8BVCF-Solar-Powered-Analog/dp/B00ELALKH2/ref=sr_1_544/184-7248556-2619812?s=watches&ie=UTF8&qid=1397580509&sr=1-544')
            # return
            if self.urlList is not None and len(self.urlList):
                for url in self.urlList:
                    if len(url) > 0:
                        url = self.regex.replaceData('(?i)\r', '', url)
                        url = self.regex.replaceData('(?i)\n', '', url)
                        self.notifyAmazon.emit('<font color=green><b>Amazon Main URL: %s</b></font>' % url)
                        imUrl = None
                        retry = 0
                        while imUrl is None and retry < 4:
                            imUrl = self.reformatUrl(url)
                            retry += 1
                        if imUrl is None:
                            imUrl = url
                        self.total = 0
                        print 'URL: ' + str(imUrl)
                        sortList = ['relevance-fs-browse-rank', 'price', '-price', 'reviewrank_authority',
                                    'date-desc-rank']
                        for sort in sortList:
                            self.scrapReformatData(imUrl, sort)
                        self.notifyAmazon.emit(
                            '<font color=red><b>Finish data for Amazon Main URL: %s</b></font><br /><br />' % url)
            self.notifyAmazon.emit('<font color=red><b>Amazon Data Scraping finished.</b></font>')
        except Exception, x:
            print x.message
            self.logger.error('Exception at run: ', x.message)
            if retry < 5:
                self.run(retry + 1)
Developer: rabbicse, Project: Python-Amazon-Scrapper, Lines of code: 58, Source file: AmazonScrapper.py

Example 5: PaodeacucarScrapper

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]
class PaodeacucarScrapper(QThread):
    notifyPaode = pyqtSignal(object)

    def __init__(self):
        QThread.__init__(self)
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.mainUrl = 'http://www.paodeacucar.com.br/'
        self.url = 'http://www.paodeacucar.com.br/'
        dupCsvReader = Csv()
        self.dupCsvRows = dupCsvReader.readCsvRow('paodeacucar.csv', 4)
        self.csvWriter = Csv('paodeacucar.csv')
        csvDataHeader = ['SKU', 'Category', 'Subcategory', 'Name', 'URL', 'URL Image', 'Details',
                         'Nutrients Table html code', 'Price from, 28/abr/14', '28/abr/14']
        if 'URL' not in self.dupCsvRows:
            self.dupCsvRows.append(csvDataHeader)
            self.csvWriter.writeCsvRow(csvDataHeader)

    def run(self):
        self.scrapData()

    def scrapData(self):
        try:
            print 'Main URL: ', self.url
            self.notifyPaode.emit(('<font color=green><b>Main URL: %s</b></font>' % self.url))
            data = self.spider.fetchData(self.url)
            if data and len(data) > 0:
                data = self.regex.reduceNewLine(data)
                data = self.regex.reduceBlankSpace(data)
                soup = BeautifulSoup(data)
                categories = soup.find('nav', class_='items-wrapper').find_all('li', class_=re.compile('\s*item\s*'))
                print 'Total Categories: ', len(categories)
                self.notifyPaode.emit(('<font color=black><b>Total Categories: %s</b></font>' % str(len(categories))))
                for category in categories:
                    if category.a is not None:
                        submenu_target = self.regex.replaceData('#', '', category.a.get('data-target'))
                        sub_categories = soup.find('ul', id=submenu_target).find_all('li', class_='item')
                        print 'Total Sub Categories: ', len(sub_categories)
                        self.notifyPaode.emit(('<font color=black><b>Total Subcategories: %s</b></font>' % str(len(sub_categories))))
                        for sub_category in sub_categories:
                            sub_category_label = sub_category.find('span', class_='label').text
                            sub_category_url = sub_category.a.get('href') if sub_category.a is not None else 'N/A'
                            self.scrapItems(sub_category_url, category.text, sub_category_label)
        except Exception, x:
            self.logger.error(x.message)
            print x
Developer: tuly, Project: Python-Paodeacucar-Spider, Lines of code: 50, Source file: PaodeacucarScrapper.py

Example 6: AmazonScrapper

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]
class AmazonScrapper():
    def __init__(self, url):
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        self.url = url
        self.base_product_url = 'http://www.amazon.com/dp/'
        self.base_image_url = 'http://ecx.images-amazon.com/images/I/'
        self.csvWriter = Csv('amazon.csv')
        csvDataHeader = ['URL', 'HTML Path', 'Image URLS']
        self.csvWriter.writeCsvRow(csvDataHeader)

    def scrapData(self):
        try:
            host = ('Host', 'www.amazon.com')
            data = self.spider.fetchData(self.url, host=host)
            if data:
                data = self.regex.reduceNewLine(data)
                data = self.regex.reduceBlankSpace(data)
                searchParams = self.regex.getSearchedData('(?i)var searchParams = {([^\}]*)}', data)
                searchParams = searchParams.split(',')
                seller = ''
                marketPlaceId = ''
                useMYI = ''
                for searchParam in searchParams:
                    searchParam = self.regex.reduceBlankSpace(searchParam)
                    searchParam = self.regex.replaceData('\'', '', searchParam)

                    if searchParam.startswith('seller'):
                        seller = searchParam.split(':')[1].strip()
                        seller = seller.decode('string-escape')
                    if searchParam.startswith('marketplaceID'):
                        marketPlaceId = searchParam.split(':')[1].strip()
                        marketPlaceId = marketPlaceId.decode('string-escape')
                    if searchParam.startswith('useMYI'):
                        useMYI = searchParam.split(':')[1].strip()
                        useMYI = useMYI.decode('string-escape')
                params = {'seller': seller,
                          'marketPlaceId': marketPlaceId,
                          'useMYI': useMYI}
                ajax_url = 'http://www.amazon.com/gp/aag/ajax/productWidget.html'
                self.scrapAjaxPage(ajax_url, params, host)
        except Exception, x:
            print x
Developer: rabbicse, Project: python-spider-bs4, Lines of code: 47, Source file: AmazonScrapper.py

Example 7: Scrapper

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]
class Scrapper(QThread):
    notifyScrapper = pyqtSignal(object)
    isFinished = False

    def __init__(self, urllist):
        QThread.__init__(self)
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()
        print urllist
        self.urllist = urllist
        self.csv = Csv('scrapper.csv')

    def run(self):
        self.scrapData()
        self.notifyScrapper.emit(
            '<font color=green><b>------------------ Finish! ------------------------- </b></font>')

    def scrapData(self):
        try:
            total = 0
            csvHeader = ['URL', 'Title', 'Price', 'Brand', 'Features', 'Material', 'Measurements', 'Category',
                         'Size', 'Color', 'Design']
            self.csv.writeCsvRow(csvHeader)
            if self.isFinished: return
            for url in self.urllist:
                if len(url) > 0:
                    url = self.regex.replaceData('(?i)\r', '', url)
                    url = self.regex.replaceData('(?i)\n', '', url)
                    url = self.regex.getSearchedData('(?i)(http.*?)$', url)
                    print 'URL: ', url
                    self.notifyScrapper.emit(('<font color=green><b>URL: %s</b></font>' % url))
                    data = self.spider.fetchData(url)
                    if data and len(data) > 0:
                        data = self.regex.reduceNewLine(data)
                        data = self.regex.reduceBlankSpace(data)
                        soup = BeautifulSoup(data)
                        soup.prettify()
                        title = ''
                        price = ''
                        size = ''
                        brand = ''
                        features = ''
                        material = ''
                        measurements = ''
                        category = ''
                        color = ''
                        design = ''
                        if soup.find('span', id='vi-lkhdr-itmTitl') is not None:
                            title = soup.find('span', id='vi-lkhdr-itmTitl').text
                        if soup.find('span', id='prcIsum'):
                            price = soup.find('span', id='prcIsum').text
                        if soup.find('div', class_='itemAttr'):
                            specchunk = soup.find('div', class_='itemAttr')
                            rows = specchunk.find_all('tr')
                            for row in rows:
                                cols = row.find_all('td')
                                for i in range(0, len(cols), 2):
                                    # if self.regex.isFoundPattern('(?i)Condition:', cols[i].text.strip()):
                                    #     conditionChunk = cols[i + 1]
                                    #     conditionChunk = self.regex.replaceData(u'(?i)<span class="infoLink u-nowrap" id="readFull">.*?</span>', '', unicode(conditionChunk))
                                    #     conditionChunk = self.regex.replaceData(u'(?i)<b class="g-hdn">.*?</b>', '', conditionChunk)
                                    #     condition = BeautifulSoup(conditionChunk).text
                                    #     print condition
                                    if self.regex.isFoundPattern('(?i)Brand:', cols[i].text.strip()):
                                        brand = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Features:', cols[i].text.strip()):
                                        features = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Material:', cols[i].text.strip()):
                                        material = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Measurements:', cols[i].text.strip()):
                                        measurements = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Category:', cols[i].text.strip()):
                                        category = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Color:', cols[i].text.strip()):
                                        color = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Design:', cols[i].text.strip()):
                                        design = cols[i + 1].text
                                    if self.regex.isFoundPattern('(?i)Size:', cols[i].text.strip()):
                                        size = cols[i + 1].text
                        self.notifyScrapper.emit('<font color=black><b>Writing data to csv file.</b></font>')
                        csvData = [url, title, price, brand, features, material, measurements, category, size, color, design]
                        self.notifyScrapper.emit('<font color=black><b>Data: %s</b></font>' % unicode(csvData))
                        self.csv.writeCsvRow(csvData)
                        self.notifyScrapper.emit('<font color=black><b>Successfully Written data to csv file.</b></font>')
                        total += 1
                        self.notifyScrapper.emit('<font color=green><b>Total Data scrapped: [%s]</b></font>' % str(total))
        except Exception, x:
            self.notifyScrapper.emit('<font color=red><b>Error scrapping category: %s</b></font>' % x.message)
            self.logger.error(x.message)
            print x
Developer: tuly, Project: Python-Ebay-Details, Lines of code: 94, Source file: Scrapper.py

Example 8: CsCat

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]

#......... part of the code omitted here .........
                self.dupCsvRows.append(csvData)
            else:
                self.notifyCategory.emit('<font color=green><b>Already Exists Category [%s] in csv file. Skip it.</b></font>' % categoryName)

            subCategories = self.regex.getAllSearchedData(
                '(?i)<li> <a href="([^"]*)" title="([^"]*)"[^>]*?>[^<]*?</a> </li>', data)
            if subCategories and len(subCategories) > 0:
                self.totalCategory += len(subCategories)
                self.notifyCategory.emit(
                    '<font color=green><b>Total Category Found [%s]</b></font>' % unicode(self.totalCategory))
                for subCategory in subCategories:
                    print subCategory
                    self.scrapSubCategory(subCategory[0], categoryName, subCategory[1])

    def scrapSubCategory(self, url, rootCategoryName, categoryName):
        self.notifyCategory.emit('<font color=green><b>Start scraping URL: %s</b></font>' % url)
        data = self.spider.fetchData(url)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            self.filterCategory(data, categoryName)
            categoryDesc = self.regex.getSearchedData('(?i)<div class="category-description std">([^<]*)</div>', data)
            categoryDesc = unicode(categoryDesc).strip()
            csvData = [rootCategoryName, categoryName, categoryDesc]
            if csvData not in self.dupCsvRows:
                self.csvWriter.writeCsvRow(csvData)
                self.dupCsvRows.append(csvData)
                self.notifyCategory.emit('<b>Scraped Data: %s</b>' % unicode(csvData))
            else:
                self.notifyCategory.emit('<font color=green><b>Already Exists Category [%s] in csv file. Skip it.</b></font>' % categoryName)

            subCategories = self.regex.getAllSearchedData(
                '(?i)<li> <a href="([^"]*)" title="([^"]*)"[^>]*?>[^<]*?</a> </li>', data)
            if subCategories and len(subCategories) > 0:
                self.totalCategory += len(subCategories)
                self.notifyCategory.emit(
                    '<font color=green><b>Total Category Found [%s]</b></font>' % unicode(self.totalCategory))
                for subCategory in subCategories:
                    self.scrapFinalCategory(subCategory[0], categoryName, subCategory[1])

    def scrapFinalCategory(self, url, rootCategoryName, categoryName):
        self.notifyCategory.emit('<font color=green><b>Start scraping URL: %s</b></font>' % url)
        data = self.spider.fetchData(url)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            self.filterCategory(data, categoryName)
            categoryDesc = self.regex.getSearchedData(u'(?i)<div class="category-description std">([^<]*)</div>', data)
            if len(categoryDesc) > 0:
                categoryDesc = categoryDesc.strip()
            csvData = [rootCategoryName, categoryName, categoryDesc]
            if csvData not in self.dupCsvRows:
                self.csvWriter.writeCsvRow(csvData)
                self.dupCsvRows.append(csvData)
                self.notifyCategory.emit('<b>Scraped Data: %s</b>' % unicode(csvData))
            else:
                self.notifyCategory.emit('<font color=green><b>Already Exists Category [%s] in csv file. Skip it.</b></font>' % categoryName)

    def filterCategory(self, data, categoryName):
    #        self.csvW = Csv(category + '.csv')
        filterData = self.regex.getSearchedData('(?i)<h4>Filter your results</h4> <dl id="narrow-by-list">(.*?)</dl>',
            data)
        if filterData and len(filterData) > 0:
            self.notifyCategory.emit('<b>Filter Data found writing to csv</b>')
            allFilters = self.regex.getAllSearchedData('(?i)<dt>([^<]*)</dt> <dd>(.*?)</dd>', filterData)
            topData = [categoryName]
            childData = []
            maxLen = 0
            for allFilter in allFilters:
                topData.append(allFilter[0])
                print 'Filter: ' + allFilter[0]
                filterName = self.regex.replaceData('(?i)<span class="price">', '', allFilter[1])
                filterName = self.regex.replaceData('(?i)</span>', '', filterName)
                filters = self.regex.getAllSearchedData('(?i)<a href=[^>]*>([^<]*)</a>', filterName)
                if filters is not None and len(filters) > 0:
                    childData.append(filters)
                    if len(filters) > maxLen:
                        maxLen = len(filters)

            if topData not in self.dupFilterCsvRows:
                self.csvW.writeCsvRow(topData)
                self.notifyCategory.emit(
                    '<font color=green><b>Filters Found For Category [%s].</b></font> <br /><b>Filters are: %s</b>' % (
                    unicode(categoryName), unicode(topData[1:])))
            else:
                self.notifyCategory.emit('<font color=green><b>Already scraped Filter For Category [%s]. Skip it.</b></font>' % categoryName)
                return

            for row in range(maxLen):
                rowData = ['']
                for columnData in childData:
                    if len(columnData) > row:
                        rowData.append(columnData[row])
                    else:
                        rowData.append('')
                print rowData
                self.csvW.writeCsvRow(rowData)
        else:
            self.notifyCategory.emit(
                '<font color=green><b>No Filter Found For Category[%s].</b></font>' % categoryName)
Developer: rabbicse, Project: Python-Bertos-Scrapper, Lines of code: 104, Source file: CsCat.py

Example 9: Regex

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]
# -*- coding: utf-8 -*-
from utils.Regex import Regex

__author__ = "Rabbi-Tuly"

if __name__ == "__main__":
    text = """<table style="width: 187px;" align="center" background=" border="0" cellpadding="0" cellspacing="0"> <tbody> <tr> <td colspan="2"><img src="http://img.photobucket.com/albums/v402/WGT-32/1.png" /> &nbsp;<a href="http://eshops.mercadolivre.com.br/quackstore/" target="_blank"><img src="http://img.photobucket.com/albums/v402/WGT-32/2.png" border="0" /></a> <img src="http://img.photobucket.com/albums/v402/WGT-32/3.png" /> <table style="width: 100%;" border="0" cellpadding="0" cellspacing="0"> <tbody> <tr> <td style="padding-left: 60px;" background="http://img.photobucket.com/albums/v402/WGT-32/fundo.png"> <p>&nbsp;</p> <p>Você está convidado a juntar-se a um grupo que durante uma semana vai estudar com um dos maiores especialistas em liderança dos Estados Unidos. Leonard Hoffman, um famoso empresário que abandonou sua brilhante carreira para se tornar monge em um mosteiro beneditino, é o personagem central desta envolvente história criada por James C. Hunter para ensinar de forma clara e agradável os princípios fundamentais dos verdadeiros líderes. Se você tem dificuldade em fazer com que sua equipe dê o melhor de si no trabalho e gostaria de se relacionar melhor com sua família e seus amigos, vai encontrar neste livro personagens, idéias e discussões que vão abrir um novo horizonte em sua forma de lidar com os outros. É impossível ler este livro sem sair transformado. "O Monge e o Executivo" é, sobretudo, uma lição sobre como se tornar uma pessoa melhor.<br /> <br /> <b>I.S.B.N.: </b>8575421026<br /> <br /> <b>Cód. Barras: </b>9788575421024<br /> <br /> <b>Reduzido: </b>149181<br /> <br /> <b>Altura: </b>21 cm.<br /><b>Largura: </b>14 cm.<br /><b>Acabamento : </b>Brochura<br /><b>Edição : </b>1 / 2004<br /><b>Idioma : </b>Português<br /><b>País de Origem : </b>Brasil<br /><b>Número de Paginas : </b>144<br /> <br /></p> <p>&nbsp;</p> <p style="text-align: center;"></p> </td> </tr> </tbody> </table> <img src="http://img.photobucket.com/albums/v402/WGT-32/5.png" /><br /> <img src="http://img.photobucket.com/albums/v402/WGT-32/6.png" /><br /> <img src="http://img.photobucket.com/albums/v402/WGT-32/7.png" /><br /> <img src="http://img.photobucket.com/albums/v402/WGT-32/8_usa_pf.png" /><br /> <img src="http://img.photobucket.com/albums/v402/WGT-32/10.png" /><br /> <img src="http://img.photobucket.com/albums/v402/WGT-32/11.png" /></td> </tr> <tr> <td valign="top" width="186"></td> <td valign="top" width="1"></td> </tr> <tr> <td colspan="2"></td> </tr> </tbody> </table>"""
    regex = Regex()
    d = regex.replaceData(r"(?i)(?:<br\s*/>\s*)+", "<br />", text)
    print d
Developer: rabbicse, Project: Python-Saraiva-Scrapper, Lines of code: 12, Source file: RegexTest.py

Example 10: YoutubeScrapper

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]
class YoutubeScrapper(object):
    def __init__(self):
        self.logger = LogManager(__name__)
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()


    def scrapVideoDownloadUrl(self, url, filename=None):
        data = self.spider.fetchData(url)
        if data and len(data) > 0:
            title = self.scrapTitle(url)
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            dlUrlChunk = self.regex.getSearchedData('(?i)"url_encoded_fmt_stream_map": "([^"]*)"', data)
            dlUrlChunk = self.regex.replaceData('(?i)\\\\u0026', ' ', dlUrlChunk)
            dlUrlParts = dlUrlChunk.split(',')
            sig = ''
            video = ''
            videoUrl = ''
            print dlUrlParts
            for dlUrlPart in dlUrlParts:
                dlUrlPart = urllib2.unquote(dlUrlPart)
                print dlUrlPart

                ## TODO
                if self.regex.isFoundPattern('(?i)itag=22', dlUrlPart) or self.regex.isFoundPattern('(?i)itag=18',
                                                                                                    dlUrlPart):
                    urlPart = dlUrlPart.split(' ')
                    for part in urlPart:
                        print part
                        if self.regex.isFoundPattern('(?i)sig=.*?', part):
                            sig = self.regex.getSearchedData('(?i)sig=(.*?)$', part)

                        if self.regex.isFoundPattern('(?i)url=.*?', part):
                            video = self.regex.getSearchedData('(?i)url=(.*?)$', part)
                            print video

                    videoUrl = video + '&signature=' + sig
                    self.downloadDir = './natok.mp4'

                    print 'Video URL= ' + videoUrl
                    print self.downloadDir
                    break

            # dlPath = './natok.mp4' if filename is None else filename
            fname = self.regex.replaceData('\s+', '_', title)
            dlPath = './' + fname  + '.mp4' if filename is None else filename
            print dlPath
            print '\n\n'
            if self.downloadFile(videoUrl, dlPath) is True:
                print 'Download complete'
        else:
            print 'No data found.'

    def scrapTitle(self, url):
        #        https://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=9bZkp7q19f0&format=xml
        xmlUrl = 'https://www.youtube.com/oembed?url=' + str(url) + '&format=xml'
        data = self.spider.fetchData(xmlUrl)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            print data
            return self.regex.getSearchedData('(?i)<title>([^<]*)</title>', data)

    def downloadFile(self, url, downloadPath, retry=0):
        try:
            opener = urllib2.build_opener(urllib2.HTTPRedirectHandler(),
                                          urllib2.HTTPHandler(debuglevel=0),
                                          urllib2.HTTPSHandler(debuglevel=0))
            opener.addheaders = [
                ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1'),
                ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
                ('Connection', 'keep-alive')]
            #            resp = opener.open(url, timeout=10)
            resp = urllib2.urlopen(url, timeout=60)
            print 'ok'

            print resp.info()
            contentLength = resp.info()['Content-Length']
            contentLength = self.regex.getSearchedData('(?i)^(\d+)', contentLength)
            totalSize = float(contentLength)
            directory = os.path.dirname(downloadPath)
            if not os.path.exists(directory):
                os.makedirs(directory)
            currentSize = 0
            dl_file = open(downloadPath, 'ab')
            try:
                if os.path.getsize(downloadPath):
                    start = os.path.getsize(downloadPath)
                    currentSize = start
                    opener.addheaders.append(('Range', 'bytes=%s-' % (start)))
            except Exception, x:
                print x

            res = opener.open(url, timeout=60)
            CHUNK_SIZE = 256 * 1024
            while True:
                data = res.read(CHUNK_SIZE)
                # data = resp.read(CHUNK_SIZE)
#......... part of the code omitted here .........
Developer: rabbicse, Project: Python-YTDownloader, Lines of code: 103, Source file: YtDownloader.py

Example 11: CsProduct

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]

#......... part of the code omitted here .........
            price = ''
            if priceChunk and len(priceChunk) > 0:
                price = self.regex.getSearchedData('(?i)([0-9,.]+)', priceChunk)

            deliveryChunk = self.regex.getSearchedData('(?i)<div class="delivery">(.*?)</div>', data)
            delivery = ''
            if deliveryChunk and len(deliveryChunk) > 0:
                delivery = self.regex.getSearchedData('(?i)<p>([^<]*)</p>', deliveryChunk)

            warrantyChunk = self.regex.getSearchedData('(?i)<div class="warranty">(.*?)</div>', data)
            warranty = ''
            if warrantyChunk and len(warrantyChunk) > 0:
                warranty = self.regex.getSearchedData('(?i)<p>([^<]*)</p>', warrantyChunk)

            ## Download and save product images
            productImageUrl = self.regex.getSearchedData(
                '(?i)src="(http://assets.cs-catering-equipment.co.uk/media/catalog/product/cache/1/image/256x/[^"]*)"',
                data)
            print productImageUrl
            productImage = self.regex.getSearchedData('(?i)/([a-zA-Z0-9-_.]*)$', productImageUrl)
            if productImage and len(productImage) > 0:
                print productImage
                self.notifyProduct.emit(
                    '<font color=green><b>Downloading Product Image [%s]. Please wait...</b></font>' % productImage)
                self.downloadFile(productImageUrl, 'product_image/' + productImage)
                self.notifyProduct.emit('<font color=green><b>Downloaded Product Image [%s].</b></font>' % productImage)
                #                self.utils.downloadFile(productImageUrl, 'product_image/' + productImage)

            ## Download and save brand images
            brandImageUrl = self.regex.getSearchedData(
                '(?i)<div class="manufacturer-box-left"><a href="[^"]*"[^>]*?><img src="([^"]*)"', data)
            brandImage = ''
            if brandImageUrl and len(brandImageUrl) > 0:
                brandImageUrl = self.regex.replaceData('(?i)logo/', '', brandImageUrl)
                brandImage = self.regex.getSearchedData('(?i)/([a-zA-Z0-9-_.]*)$', brandImageUrl)
                if brandImage and len(brandImage) > 0:
                    self.notifyProduct.emit(
                        '<font color=green><b>Downloading Brand Image [%s]. Please wait...</b></font>' % brandImage)
                    #                    self.utils.downloadFile(brandImageUrl, 'brand_image/' + brandImage)
                    self.downloadFile(brandImageUrl, 'brand_image/' + brandImage)
                    self.notifyProduct.emit('<font color=green><b>Downloaded Brand Image [%s].</b></font>' % brandImage)

            csvData = [url, productCode, productName, manufacturer, listPrice, price, savePrice,
                       productShortDesc, productFullDesc, productTechnicalDesc, warranty, delivery,
                       productImage,
                       category1Name, category2Name, category3Name, category4Name, brandImage]

            self.csvWriter.writeCsvRow(csvData)
            self.logger.debug(unicode(csvData))
            self.notifyProduct.emit('<b>Product Details: %s</b>' % unicode(csvData))


    def downloadFile(self, url, downloadPath, retry=0):
        print url
        self.notifyProduct.emit('<b>File URL: %s.</b>' % url)
        try:
            socket.setdefaulttimeout(10)
            opener = urllib2.build_opener(urllib2.HTTPRedirectHandler(),
                urllib2.HTTPHandler(debuglevel=0),
                urllib2.HTTPSHandler(debuglevel=0))
            opener.addheaders = [
                ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1')]
            urllib2.install_opener(opener)
            #            resp = opener.open(url, timeout=30)
            #            resp = urllib2.urlopen(url, timeout=30)
            resp = None
Developer: rabbicse, Project: Python-CsCatering-scrapper, Lines of code: 70, Source file: CsProduct.py

Example 12: BetrosProduct

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]

#......... part of the code omitted here .........
            categoryChunk = self.regex.getSearchedData('(?i)<div id="contenuto1">(.*?)</div>\s*?</div>', data)
            if categoryChunk and len(categoryChunk) > 0:
                categories = self.regex.getAllSearchedData('(?i)<a href="([^"]*)"[^>]*?>([^<]*)</a>', categoryChunk)
                if categories and len(categories) > 0:
                    self.notifyProduct.emit('<b>Total Categories Found: %s</b>' % str(len(categories)))
                    for category in categories:
                        categoryName = category[1].strip()
                        self.scrapSubCategory(str(category[0]).strip(), categoryName, dupCsvRows, csvWriter)

    def scrapSubCategory(self, url, categoryName, dupCsvRows, csvWriter):
        self.logger.debug('Category URL: ' + url)
        self.notifyProduct.emit('<b>Try to scrap subcategories for: %s</b>' % categoryName)
        data = self.spider.fetchData(url)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            subCategories = self.regex.getAllSearchedData('(?i)<li\s*?><a href="([^"]*)" title="([^"]*)"', data)
            if subCategories and len(subCategories) > 0:
                self.notifyProduct.emit(
                    '<font color=green><b>Total subcategories found %s.</b></font>' % str(len(subCategories)))
                for subCategory in subCategories:
                    subCategoryName = subCategory[1].strip()
                    self.scrapProducts(subCategory[0].strip(), categoryName, subCategoryName, dupCsvRows, csvWriter)

    def scrapProducts(self, url, categoryName, subCategoryName, dupCsvRows, csvWriter):
        self.logger.debug('Product URL: ' + url)
        self.notifyProduct.emit('<b>Product URL: %s.</b>' % url)
        data = self.spider.fetchData(url)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            categoryDescription = self.regex.getSearchedData(
                '(?i)<td class="prodottidescrizione1">\s*?<h1>[^<]*?</h1>(.*?)</td>', data)
            categoryDescription = self.regex.replaceData('(?i)<!--.*?-->', '', categoryDescription)
            categoryDescription = self.regex.replaceData('(?i)<[^>]*>', '', categoryDescription)
            productUrl = self.regex.getSearchedData('(?i)^(http://.*?)/', url)
            categoryImage = self.regex.getSearchedData(
                '(?i)<div class="boximgcat" id="boximgcatid">\s*?<a rel="shadowbox" href="([^"]*)"', data)
            categoryImageName = self.regex.getSearchedData('(?i)/([a-zA-Z0-9-_. ]*)$', categoryImage)
            categoryImageName = self.regex.replaceData('\s+', '_', categoryImageName.strip())
            if categoryImageName is not None and len(categoryImageName) > 0 and not os.path.exists(
                'category_image/' + categoryImageName):
                self.notifyProduct.emit(
                    '<font color=green><b>Downloading Category Image: </b>%s <b>Please Wait...</b></font>' % categoryImageName)
                self.downloadFile(productUrl + categoryImage, 'category_image/' + categoryImageName)
                #            self.utils.downloadFile(categoryImage, 'category_image/' + categoryImageName)
                self.notifyProduct.emit(
                    '<font color=green><b>Downloaded Category Image: %s.</b></font>' % categoryImageName)

            productChunks = self.regex.getSearchedData('(?i)<table.*?class="prodottiriga"[^>]*?>(.*?)</table>', data)
            if productChunks and len(productChunks) > 0:
                productChunk = self.regex.getAllSearchedData('(?i)<tr>(.*?</div>\s*?</td>)\s*?</tr>', productChunks)
                for products in productChunk:
                    print 'url: ' + url
                    code = self.regex.getSearchedData('(?i)Cod\. ([a-zA-Z0-9 /]+)', products).strip()
                    for dup in dupCsvRows:
                        if code == dup[4]:
                            return
                    model = self.regex.getSearchedData('(?i)Mod\. ([^<]*)<', products).strip()
                    productName = self.regex.getSearchedData('(?i)<h1>([^<]*)</h1>', products).strip()
                    self.notifyProduct.emit(
                        '<font color=green><b>Product Name: %s.</b></font>' % productName)
                    desc = self.regex.getSearchedData(
                        '(?i)<div id="prtdescrizione\d+" style="display:none">(.*?)<div class="prodotticomando">',
                        products).strip()
                    productImage = productUrl + self.regex.getSearchedData('(?i)<img src="/tpl/\.\.([^"]*)"',
Developer: rabbicse, Project: Python-Bertos-Scrapper, Lines of code: 70, Source file: BetrosProduct.py

Example 13: WebPageToPdf

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]
class WebPageToPdf(QObject):
    threadPdfStatusBar = QtCore.pyqtSignal(object)
    threadPdfWritingStatus = QtCore.pyqtSignal(object)
    threadPdfWritingDone = QtCore.pyqtSignal(int)

    def __init__(self):
        QObject.__init__(self)
        self.regex = Regex()
        self.title = ''
        self.webView = QWebView()
        self.webView.settings().setAttribute(QWebSettings.AutoLoadImages, True)
        self.webView.settings().setAttribute(QWebSettings.JavascriptEnabled, True)
        self.webView.settings().setAttribute(QWebSettings.PluginsEnabled, True)
        self.webView.settings().setAttribute(QWebSettings.DeveloperExtrasEnabled, True)
        self.pdfPrinter = QPrinter()
        self.webView.loadFinished.connect(self.convertToPdf)


    def setupDefaultPdfPrinter(self, fileName):
        self.pdfPrinter.setOrientation(QPrinter.Portrait)
        self.pdfPrinter.setPageSize(QPrinter.A4)
        self.pdfPrinter.setOutputFormat(QPrinter.PdfFormat)
        self.pdfPrinter.setOutputFileName(fileName)

    def printWebHtmlToPdf(self, url, filePath, fileName, groupType):
        self.tempPdfFile = filePath + 'out.pdf'
        self.filePath = filePath
        self.fileName = fileName
        self.url = url
        self.groupType = groupType
        self.setupDefaultPdfPrinter(self.tempPdfFile)
        self.threadPdfStatusBar.emit('Fetching Data From Web. Please Wait...')
        #        self.threadPdfWritingStatus.emit(
        #            '<font size=4 color=green><b>Method "%s": </b></font><font color=green><b>Fetching Data From Web for</b> %s<b>.<br />Please Wait...</b></font>' % (
        #                self.groupType, self.url))
        self.threadPdfWritingStatus.emit(
            '<font color=green><b>Fetching Data From Web for</b> %s<b>.<br />Please Wait...</b></font>' % self.url)
        self.webView.load(QUrl(url))
        self.title = self.webView.title()

    def convertToPdf(self):
        print 'Generating Pdf'
        #        self.threadPdfWritingStatus.emit(
        #            '<font size=4><b>Method "%s": </b></font><b>Generating Pdf for</b> %s<b>. Please Wait...</b>' % (
        #                self.groupType, self.url))
        self.threadPdfWritingStatus.emit(
            '<b>Generating Pdf for</b> %s<b>. Please Wait...</b>' % self.url)
        self.threadPdfStatusBar.emit('Generating Pdf. Please Wait...')
        self.webView.print_(self.pdfPrinter)
        print 'Generated Pdf'
        #        self.threadPdfWritingStatus.emit(
        #            '<font size=4><b>Method "%s": </b></font><b>Generated Pdf for</b> %s<b>. Please Wait...</b>' % (
        #                self.groupType, self.url))
        self.threadPdfWritingStatus.emit(
            '<b>Generated Pdf for</b> %s<b>. Please Wait...</b>' % self.url)
        self.threadPdfStatusBar.emit('Generated Pdf.')
        self.mergePdf()
        self.threadPdfWritingDone.emit(True)

    def mergePdf(self):
    #        self.threadPdfWritingStatus.emit(
    #            '<font size=4><b>Method "%s": </b></font><b>Setting Title for</b> %s<b>. Please Wait...</b><br />' % (
    #                self.groupType, self.url))
        self.threadPdfWritingStatus.emit(
            '<b>Setting Title for</b> %s<b>. Please Wait...</b><br />' % self.url)

        packet = StringIO()
        # create a new PDF with Reportlab
        pdfCanvas = canvas.Canvas(packet, pagesize=A4)
        pdfCanvas.setFont('Helvetica', 8)
        if len(self.title) == 0:
            self.title = str(self.url).split('/')[-1]
            self.title = self.regex.getSearchedData('(?i)([a-zA-Z0-9-_ ]*?)\.[a-zA-Z0-9_]*$', self.title)
            self.title = self.regex.replaceData('(?i)_', ' ', self.title)
        title = unicode(self.title[:57] + '...') if  (len(self.title) > 60) else unicode(self.title)
        url = self.url[:57] + '...' if (len(self.url) > 60) else self.url
        pdfCanvas.drawString(5, 830, title + '                      ' + str(url).lower())
        d = datetime.datetime.now()
        strDate = str(d.strftime("%Y-%m-%d %H-%M-%S %p"))
        pdfCanvas.drawString(420, 5, 'Created Date Time: ' + strDate)
        pdfCanvas.save()
        packet.seek(0)
        newPdf = PdfFileReader(packet)

        if not os.path.exists(self.tempPdfFile):
            return self.printWebHtmlToPdf(self.url, self.filePath, self.fileName, self.groupType)

        writer = PdfFileWriter()
        tmpPdfFile = file(self.tempPdfFile, 'rb')
        reader = PdfFileReader(tmpPdfFile)
        for i in range(0, (reader.getNumPages())):
            page = reader.getPage(i)
            page.mergePage(newPdf.getPage(0))
            #            page = newPdf.getPage(0)
            #            page.mergePage(reader.getPage(i))
            writer.addPage(page)
        print 'Filename: ' + self.fileName
        outputStream = file(self.filePath + self.fileName, "wb")
        writer.write(outputStream)
        outputStream.close()
#......... part of the code omitted here .........
Developer: rabbicse, Project: Python-Web2Pdf, Lines of code: 103, Source file: WebPageToPdf.py

Example 14: YtDownloadManager

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]
class YtDownloadManager(object):
    def __init__(self):
        self.spider = Spider()
        self.regex = Regex()
        self.utils = Utils()

    def scrapVideoDownloadUrl(self, url):
        data = self.spider.fetchData(url)
        print data
        soup = BeautifulSoup(data)
        exit(1)
        if data and len(data) > 0:
            title = self.scrapTitle(url)
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            dlUrlChunk = self.regex.getSearchedData('(?i)"url_encoded_fmt_stream_map": "([^"]*)"', data)
            dlUrlChunk = self.regex.replaceData('(?i)\\\\u0026', ' ', dlUrlChunk)
            dlUrlParts = dlUrlChunk.split(',')
            sig = ''
            video = ''
            videoUrl = ''
            print dlUrlParts
            for dlUrlPart in dlUrlParts:
                dlUrlPart = urllib2.unquote(dlUrlPart)
                print dlUrlPart

                #                if self.regex.isFoundPattern('(?i)itag=5', dlUrlPart):
                urlPart = dlUrlPart.split(' ')
                for part in urlPart:
                    print part
                    if self.regex.isFoundPattern('(?i)sig=.*?', part):
                        sig = self.regex.getSearchedData('(?i)sig=(.*?)$', part)

                    if self.regex.isFoundPattern('(?i)url=.*?', part):
                        video = self.regex.getSearchedData('(?i)url=(.*?)$', part)
                        print video

                videoUrl = video + '&signature=' + sig
                self.downloadDir = './test.flv'

                # print 'Video URL= ' + videoUrl
                # print self.downloadDir
                # dlPath = './test.flv'
                # print dlPath

                print '\n\n'

                # if self.downloadFile(videoUrl, dlPath) is True:
                #     break

    def scrapTitle(self, url):
    #        https://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=9bZkp7q19f0&format=xml
        xmlUrl = 'https://www.youtube.com/oembed?url=' + str(url) + '&format=xml'
        data = self.spider.fetchData(xmlUrl)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            return self.regex.getSearchedData('(?i)<title>([^<]*)</title>', data)

    def downloadFile(self, url, downloadPath, retry=0):
        try:
            opener = urllib2.build_opener(urllib2.HTTPRedirectHandler(),
                                          urllib2.HTTPHandler(debuglevel=0),
                                          urllib2.HTTPSHandler(debuglevel=0))
            opener.addheaders = [
                ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1'),
                ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
                ('Connection', 'keep-alive')]
            #            resp = opener.open(url, timeout=10)
            resp = urllib2.urlopen(url, timeout=30)

            print resp.info()
            contentLength = resp.info()['Content-Length']
            contentLength = self.regex.getSearchedData('(?i)^(\d+)', contentLength)
            totalSize = float(contentLength)
            directory = os.path.dirname(downloadPath)
            if not os.path.exists(directory):
                os.makedirs(directory)
            dl_file = open(downloadPath, 'wb')
            currentSize = 0
            CHUNK_SIZE = 32768
            while True:
                data = resp.read(CHUNK_SIZE)
                if not data:
                    break
                currentSize += len(data)
                dl_file.write(data)

                print('============> ' + \
                      str(round(float(currentSize * 100) / totalSize, 2)) + \
                      '% of ' + str(totalSize) + ' bytes')
                notifyDl = '===> Downloaded ' + str(round(float(currentSize * 100) / totalSize, 2)) + '% of ' + str(
                    totalSize) + ' KB.'
                if currentSize >= totalSize:
                    dl_file.close()
                    return True
        except Exception, x:
            error = 'Error downloading: ' + str(x)
            return False
Developer: rabbicse, Project: Python-YTDownloader, Lines of code: 101, Source file: YtDownloadManager.py

Example 15: CsProduct

# Required import: from utils.Regex import Regex [as alias]
# Or: from utils.Regex.Regex import replaceData [as alias]

#......... part of the code omitted here .........
    def scrapProductsDetails(self, url, category1Name, category2Name, category3Name, category4Name):
        self.logger.debug('Product Details URL: ' + url)
        self.notifyProduct.emit('<b>Try to scrap all products under Category[%s]</b>' % category4Name)
        self.notifyProduct.emit('<font color=green><b>Category URL: %s</b></font>' % url)
        data = self.spider.fetchData(url + '?limit=10000&mode=list')
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            products = self.regex.getAllSearchedData('(?i)<div class="listing-item[^"]*?">(.*?)</div>', data)
            if products and len(products) > 0:
                self.totalProducts += len(products)
                self.notifyProduct.emit('<font color=green><b>Total Products Found [%s]</b></font>' % unicode(self.totalProducts))
                for product in products:
                    productDetailUrl = self.regex.getSearchedData('(?i)<a href="([^"]*)"', product)
                    if productDetailUrl not in self.dupCsvRows:
                        self.scrapProductDetails(productDetailUrl, category1Name, category2Name, category3Name,
                            category4Name)
                    else:
                        self.notifyProduct.emit(
                            '<font color=green><b>Already Exists This Product Under Category[%s]. Skip It.</b></font>' % category4Name)

    def scrapProductDetails(self, url, category1Name, category2Name, category3Name, category4Name):
        self.logger.debug('Product Detail URL: ' + url)
        self.notifyProduct.emit('<b>Try to scrap product details under Category[%s]</b>' % category4Name)
        self.notifyProduct.emit('<font color=green><b>Product Detail URL: %s</b></font>' % url)
        data = self.spider.fetchData(url)
        if data and len(data) > 0:
            data = self.regex.reduceNewLine(data)
            data = self.regex.reduceBlankSpace(data)
            manufacturer = self.regex.getSearchedData(
                '(?i)<span class="manufacturer-box-label">Manufacturer:</span>([^<]*)</p>', data)
            productCode = self.regex.getSearchedData(
                '(?i)<span class="manufacturer-box-label">Model No:</span>([^<]*)</p>',
                data)
            productName = self.regex.getSearchedData('(?i)<div class="product-name"> <h1>([^<]*)</h1>', data)
            productTechnicalDesc = self.regex.getSearchedData('(?i)<div class="product-short-description">([^<]*)</div>'
                , data)
            productDescriptions = self.regex.getSearchedData('(?i)<div class="product-specs">(.*?)</div>', data)
            productShortDesc = ''
            productFullDesc = ''
            if productDescriptions and len(productDescriptions) > 0:
                print 'desc: ' + productDescriptions
                productShortDesc = self.regex.getSearchedData('(?i)<p>(.*?)</p>', productDescriptions)
                productFullDesc = '\n'.join(
                    self.regex.getAllSearchedData('(?i)<li>([^<]*)</li>', productDescriptions))
            listPriceChunk = self.regex.getSearchedData('(?i)<div class="rrp-price regular-price">(.*?)</div>', data)
            listPrice = ''
            if listPriceChunk and len(listPriceChunk) > 0:
                listPrice = self.regex.getSearchedData('(?i)([0-9,.]+)', listPriceChunk)

            savePriceChunk = self.regex.getSearchedData('(?i)<div class="regular-price saving-price">(.*?)</div>', data)
            savePrice = ''
            if savePriceChunk and len(savePriceChunk) > 0:
                savePrice = self.regex.getSearchedData('(?i)([0-9%]+)', savePriceChunk)

            priceChunk = self.regex.getSearchedData('(?i)<div class="[^"]*" id="product-price-\d+">(.*?)</div>', data)
            price = ''
            if priceChunk and len(priceChunk) > 0:
                price = self.regex.getSearchedData('(?i)([0-9,.]+)', priceChunk)

            deliveryChunk = self.regex.getSearchedData('(?i)<div class="delivery">(.*?)</div>', data)
            delivery = ''
            if deliveryChunk and len(deliveryChunk) > 0:
                delivery = self.regex.getSearchedData('(?i)<p>([^<]*)</p>', deliveryChunk)

            warrantyChunk = self.regex.getSearchedData('(?i)<div class="warranty">(.*?)</div>', data)
            warranty = ''
            if warrantyChunk and len(warrantyChunk) > 0:
                warranty = self.regex.getSearchedData('(?i)<p>([^<]*)</p>', warrantyChunk)

            ## Download and save product images
            productImageUrl = self.regex.getSearchedData(
                '(?i)src="(http://assets.cs-catering-equipment.co.uk/media/catalog/product/cache/1/image/256x/[^"]*)"',
                data)
            print productImageUrl
            productImage = self.regex.getSearchedData('(?i)/([a-zA-Z0-9-_.]*)$', productImageUrl)
            if productImage and len(productImage) > 0:
                print productImage
                self.notifyProduct.emit('<b>Downloading Product Image [%s]. Please wait...</b>' % productImage)
                self.utils.downloadFile(productImageUrl, 'product_image/' + productImage)

            ## Download and save brand images
            brandImageUrl = self.regex.getSearchedData(
                '(?i)<div class="manufacturer-box-left"><a href="[^"]*"[^>]*?><img src="([^"]*)"', data)
            brandImage = ''
            if brandImageUrl and len(brandImageUrl) > 0:
                brandImageUrl = self.regex.replaceData('(?i)logo/', '', brandImageUrl)
                brandImage = self.regex.getSearchedData('(?i)/([a-zA-Z0-9-_.]*)$', brandImageUrl)
                if brandImage and len(brandImage) > 0:
                    self.notifyProduct.emit('<b>Downloading Brand Image [%s]. Please wait...</b>' % brandImage)
                    self.utils.downloadFile(brandImageUrl, 'brand_image/' + brandImage)

            csvData = [url, productCode, productName, manufacturer, listPrice, price, savePrice,
                       productShortDesc, productFullDesc, productTechnicalDesc, warranty, delivery,
                       productImage,
                       category1Name, category2Name, category3Name, category4Name, brandImage]

            self.csvWriter.writeCsvRow(csvData)
            self.logger.debug(unicode(csvData))
            self.notifyProduct.emit('<b>Product Details: %s</b>' % unicode(csvData))
Developer: rabbicse, Project: Python-Bertos-Scrapper, Lines of code: 104, Source file: CsProduct.py


Note: The utils.Regex.Regex.replaceData examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright of the source code remains with the original authors. For distribution and use, please follow the corresponding project's license; do not reproduce without permission.