Python Scraper.start Method Code Examples

This article collects typical usage examples of the scraper.Scraper.start method in Python. If you have been wondering what Scraper.start does, how to call it, or what real uses of it look like, the curated examples below should help. You can also explore further usage examples of the containing class, scraper.Scraper.


Three code examples of the Scraper.start method are shown below, sorted by popularity by default.
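Before the full examples, here is a minimal sketch of the typical Scraper.start call pattern, distilled from Examples 2 and 3 below. The URL dictionary contents and the body of the handle_doc callback are illustrative assumptions, not code from the examples:

from scraper import Scraper

# hypothetical callback body; Example 2 only shows that one is registered
def handle_doc(doc):
    print("found document: {0}".format(doc))

scraper = Scraper(DEBUG=True)
scraper.set_callbacks(found_doc_callback=handle_doc)
# 'targeturl' appears in Example 1's status data; other fields are elided in the excerpts
scraper.set_url_data({'targeturl': 'http://example.com'})
data = scraper.start()  # blocks until the crawl finishes, then returns the collected data

As Example 2 relies on, start() runs the scrape synchronously and returns the result data.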

Example 1: ScraperWrapper

# Required import: from scraper import Scraper [as alias]
# Alternatively: from scraper.Scraper import start [as alias]
# (the imports below are implied by the excerpt but elided on the source page)
import threading
import uuid
from time import strftime

import pika
import simplejson

from scraper import Scraper
class ScraperWrapper(threading.Thread):

    def __init__(self, address='localhost', exchange='barkingowl', DEBUG=False):
        threading.Thread.__init__(self)

        self.uid = str(uuid.uuid4())
        self.address = address
        self.exchange = exchange
        self.DEBUG = DEBUG
        self.interval = 1

        # create scraper instance
        self.scraper = Scraper(uid=self.uid, DEBUG=DEBUG)
        self.scraping = False

        # set up message bus (pre-1.0 pika API: newer releases take
        # exchange_type= in exchange_declare and auto_ack= in basic_consume)
        self.respcon = pika.BlockingConnection(pika.ConnectionParameters(
                                                           host=self.address))
        self.respchan = self.respcon.channel()
        self.respchan.exchange_declare(exchange=self.exchange, type='fanout')

        self.reqcon = pika.BlockingConnection(pika.ConnectionParameters(host=address))
        self.reqchan = self.reqcon.channel()
        self.reqchan.exchange_declare(exchange=exchange, type='fanout')
        result = self.reqchan.queue_declare(exclusive=True)
        queue_name = result.method.queue
        self.reqchan.queue_bind(exchange=exchange, queue=queue_name)
        self.reqchan.basic_consume(self.reqcallback, queue=queue_name, no_ack=True)

        if self.DEBUG:
            print("Scraper Wrapper INIT complete.")

    def run(self):
        # setup call backs
        self.scraper.setFinishedCallback(self.scraperFinishedCallback)
        self.scraper.setStartedCallback(self.scraperStartedCallback)
        self.scraper.setBroadcastDocCallback(self.scraperBroadcastDocCallback)

        # broadcast availability
        self.broadcastavailable()
        self.reqchan.start_consuming()

    def stop(self):
        self.scraper.stop()
        self.reqchan.stop_consuming()

    def broadcastavailable(self):
        if self.scraper.status['busy']:
            # currently scraping, so we are not available - don't broadcast
            return

        isodatetime = strftime("%Y-%m-%d %H:%M:%S")
        packet = {
            'availabledatetime': str(isodatetime)
        }
        payload = {
            'command': 'scraper_available',
            'sourceid': self.uid,
            'destinationid': 'broadcast',
            'message': packet
        }
        jbody = simplejson.dumps(payload)
        self.respchan.basic_publish(exchange=self.exchange,routing_key='',body=jbody)

        #
        # TODO: move this over to its own timer, no need to do it here.
        #
        if self.scraper.stopped():
            raise Exception("Scraper Wrapper Exiting")
        else:
            threading.Timer(self.interval, self.broadcastavailable).start()
        
    def broadcaststatus(self):
        isodatetime = strftime("%Y-%m-%d %H:%M:%S")
        packet = {
            'status': self.scraper.status,
            'urldata': self.scraper.status['urldata'],  # was self.status['urldata'], which does not exist on the wrapper
            'statusdatetime': str(isodatetime)
        }
        payload = {
            'command': 'scraper_status',
            'sourceid': self.uid,
            'destinationid': 'broadcast',
            'message': packet
        }
        jbody = simplejson.dumps(payload)
        #time.sleep(.5)
        self.respchan.basic_publish(exchange=self.exchange,routing_key='',body=jbody)

    def broadcastsimplestatus(self):
        isodatetime = strftime("%Y-%m-%d %H:%M:%S")

        if self.scraper.status['urldata'] == {}:
            targeturl = 'null'
        else:
            targeturl = self.scraper.status['urldata']['targeturl']

        packet = {
            'busy': self.scraper.status['busy'],
            'linkcount': self.scraper.status['linkcount'],
#......... remainder of this code omitted .........
Author: citruspi | Project: BarkingOwl | Lines: 103 | Source: launcher.py
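Since this ScraperWrapper subclasses threading.Thread, it is presumably launched like any other thread. A hedged usage sketch; the host name and the fixed sleep before shutdown are assumptions for illustration:

import time

wrapper = ScraperWrapper(address='localhost', exchange='barkingowl', DEBUG=True)
wrapper.start()   # Thread.start() invokes run(): registers callbacks, begins consuming
time.sleep(60)    # let the wrapper announce availability and handle bus messages
wrapper.stop()    # stops the scraper and the request consumer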

Example 2: Scraper

# Required import: from scraper import Scraper [as alias]
# Alternatively: from scraper.Scraper import start [as alias]
        #try:
        if True:  # placeholder keeping the indentation of the disabled try block
            scraper = Scraper(
                check_type=options.tracking_method,
                check_type_uri=options.uri,
                DEBUG=_DEBUG,
            )
            scraper.set_callbacks(
                found_doc_callback = handle_doc,
            )
            scraper.set_url_data(url)

            if _DEBUG:
                print("\nStarting Scraper on {0} ...\n\n".format(options.target_url))
            data = scraper.start()
            if _DEBUG:
                print("\n\nScraper complete.\n")

            if _DEBUG:
                print("BarkingOwl Scraper found {0} documents on {1}.\n\n".format(
                    len(data['documents']),
                    options.target_url,
                ))

            if options.json_output:
                data = scraper._data
                for key in data:
                    if isinstance(data[key], datetime.datetime) or \
                            isinstance(data[key], datetime.timedelta):
                        data[key] = str(data[key])
Author: thequbit | Project: BarkingOwl | Lines: 32 | Source: cli-scraper.py
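The loop at the end of Example 2 stringifies datetime and timedelta values because simplejson (like the standard json module) cannot serialize those types directly. A standalone illustration of the problem and the fix, not part of the original file:

import datetime
import simplejson

data = {'start': datetime.datetime.now(),
        'elapsed': datetime.timedelta(seconds=42)}

# simplejson.dumps(data) would raise TypeError at this point
for key in data:
    if isinstance(data[key], (datetime.datetime, datetime.timedelta)):
        data[key] = str(data[key])

print(simplejson.dumps(data))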

Example 3: ScraperWrapper

# Required import: from scraper import Scraper [as alias]
# Alternatively: from scraper.Scraper import start [as alias]
# (this excerpt also uses uuid, datetime, and a BusAccess class whose
# module path is elided on the source page)
class ScraperWrapper(object): #threading.Thread):

    def __init__(self,
                 address='localhost',
                 exchange='barkingowl',
                 heartbeat_interval=30,
                 url_parameters=None,
                 broadcast_interval=5,
                 uid=None,
                 DEBUG=False):

        #threading.Thread.__init__(self,name="ScraperWrapper : %s" % uid)

        # the original declared uid=str(uuid.uuid4()) as the default argument,
        # which is evaluated once at definition time, so every wrapper created
        # without an explicit uid would share the same id (it also assigned
        # self.uid twice); generating the default here avoids both problems
        self.uid = uid if uid is not None else str(uuid.uuid4())
        self.address = address
        self.exchange = exchange
        self.heartbeat_interval = heartbeat_interval
        self.url_parameters = url_parameters

        self.broadcast_interval = broadcast_interval
        self._DEBUG = DEBUG

        print "ScraperWrapper().__init__(): Creating scraper ..."
 
        self.scraper = Scraper(
            DEBUG = self._DEBUG,
        )
        self.scraping = False
        self.scraper_thread = None

        print "ScraperWrapper().__init__(): Scraper Created."

        self.stopped = False

        self.bus_access = BusAccess(
            uid = self.uid,
            address = self.address,
            exchange = self.exchange,
            heartbeat_interval = self.heartbeat_interval,
            url_parameters = self.url_parameters,
            DEBUG = self._DEBUG,
        )
        self.bus_access.set_callback(
            callback = self._reqcallback,
        )

        #threading.Timer(self.interval, self.broadcast_available).start()
        #threading.Timer(self.interval, self.broadcast_simple_status).start()

        #self.broadcast_status()

        #log( "ScraperWrapper.__init__(): Scraper Wrapper INIT complete.", self.DEBUG )

    #def run(self):
    def start(self):
        self.scraper.set_callbacks(
            start_callback = self.scraper_started_callback,
            finished_callback = self.scraper_finished_callback,
            found_doc_callback = self.scraper_broadcast_document_callback,
            new_url_callback = None,
            bandwidth_limit_callback = None,
            memory_limit_callback = None,
            error_callback = None,
        )

        self.broadcast_status()

    def stop(self):
        self.bus_access.stop_listening()
        self.scraper.stop()
        self.stopped = True

    def reset_scraper(self):
        self.scraper.reset()

    def broadcast_status(self):

        if self._DEBUG:
            print("ScraperWrapper().broadcast_status(): Entering status loop.")

        while not self.scraping and not self.stopped:

            if self._DEBUG:
                print("ScraperWrapper.broadcast_status() sending status pulse ...")

            if not self.scraping and not self.scraper._data['working']:
                packet = {
                    'available_datetime': str(datetime.datetime.now())
                }
                self.bus_access.send_message(
                    command = 'scraper_available',
                    destination_id = 'broadcast',
                    message = packet,
                )

            '''
            packet = {
                'working': self.scraper._data['working'],
                'seen_url_count': len(self.scraper._data['seen_urls']),
#......... remainder of this code omitted .........
Author: thequbit | Project: BarkingOwl | Lines: 103 | Source: scraperwrapper.py
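Unlike Example 1, this refactored wrapper is a plain object rather than a thread (the bus wiring is delegated to BusAccess inside __init__), so start() is called directly and blocks in the broadcast_status() loop. A hedged usage sketch; the construction arguments simply mirror the __init__ defaults:

wrapper = ScraperWrapper(
    address='localhost',
    exchange='barkingowl',
    heartbeat_interval=30,
    DEBUG=True,
)
wrapper.start()   # registers scraper callbacks, then loops in broadcast_status()
# stop() would be called from another thread or a signal handler:
# wrapper.stop()  # stops bus listening and the scraper, sets self.stopped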


Note: The scraper.Scraper.start examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by their original authors, who retain copyright over the source code; consult each project's license before distributing or using the code. Do not reproduce without permission.