

Python Scraper.set_url_data Method Code Examples

This article collects typical usage examples of the scraper.Scraper.set_url_data method in Python. If you are wondering what Scraper.set_url_data does, how to call it, or what real uses of it look like, the selected code examples below may help. You can also explore further usage examples of the class it belongs to, scraper.Scraper.


The following shows 5 code examples of the Scraper.set_url_data method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
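To orient the examples that follow, here is a minimal sketch of the calling pattern they share: build a url_data dictionary, pass it to the scraper with set_url_data, then start the crawl. The dictionary keys and the find_docs() call are taken from Examples 1 and 2 below; the placeholder URL is hypothetical, and the Scraper(uid) constructor usage is assumed from Example 1 rather than from documented API.

# Minimal sketch (not from the original project docs); keys mirror Examples 1 and 2 below
import datetime
import uuid

from scraper import Scraper  # assumed importable from the BarkingOwl project

url_data = {
    'url_id': 1,
    'target_url': 'http://example.com/',   # placeholder URL
    'max_link_level': 1,
    'creation_date_time': str(datetime.datetime.now()),
    'doc_type': 'application/pdf',
    'dispatch_datetime': str(datetime.datetime.now()),
    'allowed_domains': [],
}

scraper = Scraper(str(uuid.uuid4()))   # a unique id for this scraper instance
scraper.set_url_data(url_data)         # describe what to crawl and which documents to collect
docs = scraper.find_docs()             # crawl and return the matching documents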

Example 1: test_find_docs

# Required import: from scraper import Scraper [as alias]
# Or: from scraper.Scraper import set_url_data [as alias]
def test_find_docs():

    declare_test_start( 'follow_link' ) 

    url_data = {
        'url_id': 1,
        'target_url': 'http://timduffy.me/',
        'max_link_level': 6,
        'creation_date_time': str(datetime.datetime.now()),
        'doc_type': 'application/pdf',
        'dispatch_datetime': str(datetime.datetime.now()),
        'allowed_domains': [],
    }

    uid = str(uuid.uuid4())
    scraper = Scraper(uid)
    scraper.set_url_data(url_data)
    docs = scraper.find_docs( )

    print '[ TEST ] {0}'.format(json.dumps(scraper.status))
    print '[ TEST ] {0}'.format(json.dumps(docs))

    passed = False
    if len(docs) > 0:
        passed = True

    declare_test_end( passed )
Developer ID: reustonium, Project: BarkingOwl, Lines of code: 29, Source: tests.py

Example 2: test_find_all_docs

# Required import: from scraper import Scraper [as alias]
# Or: from scraper.Scraper import set_url_data [as alias]
def test_find_all_docs(url):

    declare_test_start( 'follow_link' )

    url_data = {
        'url_id': 1,
        'target_url': url, # 'http://www.scottsvilleny.org/',
        'max_link_level': -1,
        'creation_date_time': str(datetime.datetime.now()),
        'doc_type': 'application/pdf',
        'dispatch_datetime': str(datetime.datetime.now()),
        'allowed_domains': [],
    }

    uid = str(uuid.uuid4())
    scraper = Scraper(uid)
    scraper.set_url_data(url_data)
    docs = scraper.find_docs( )
    status = scraper.status
    #print '[ TEST ] {0}'.format(json.dumps(scraper.status))
    #print '[ TEST ] {0}'.format(json.dumps(docs))


    with open('find_docs_external_results.json','w') as f:
        f.write(json.dumps(status))

    with open('find_docs_external_all_docs.json', 'w') as f:
        f.write(json.dumps(docs))

    passed = False
    if len(docs) > 0:
        passed = True

    declare_test_end( passed )

    return docs, status
Developer ID: reustonium, Project: BarkingOwl, Lines of code: 38, Source: tests.py

Example 3: ScraperWrapper

# Required import: from scraper import Scraper [as alias]
# Or: from scraper.Scraper import set_url_data [as alias]

#......... some of the code is omitted here .........
            targeturl = self.scraper.status['url_data']['target_url']

        packet = {
            'busy': self.scraper.status['busy'],
            'link_count': self.scraper.status['link_count'],
            'bad_link_count': len(self.scraper.status['bad_links']),
            'target_url': targeturl,
            'status_datetime': str(datetime.datetime.now())
        }
        payload = {
            'command': 'scraper_status_simple',
            'source_id': self.uid,
            'destination_id': 'broadcast',
            'message': packet
        }
        jbody = json.dumps(payload)
        self.respchan.basic_publish(exchange=self.exchange,routing_key='',body=jbody)

    def scraper_finished_callback(self,payload):
        """
        scraper_finished_callback() is the built-in, default async callback for when the 'scraper finished' command is seen.
        """
        jbody = json.dumps(payload)
        self.respchan.basic_publish(exchange=self.exchange,routing_key='',body=jbody)
        return

    def scraper_started_callback(self,payload):
        """
        scraper_started_callback() is the built-in, default async callback for when the 'scraper started' command is seen.
        """
        jbody = json.dumps(payload)
        self.respchan.basic_publish(exchange=self.exchange,routing_key='',body=jbody)
        return

    def scraper_broadcast_document_callback(self,payload):
        """
        scraper_broadcast_document_callback() is the built-in, default async callback for when the scraper broadcasts a newly found document.
        """
        jbody = json.dumps(payload)
        self.respchan.basic_publish(exchange=self.exchange,routing_key='',body=jbody)
        return

    def _scraperstart(self):
        #if self.scraper.start == False:
        #    self.scraper.start()
        #self.scraper.begin()

        self.scraper.find_docs()

    # message handler
    def _reqcallback(self,ch,method,properties,body):
        #try:
        if True:
            response = json.loads(body)
            
            # commented this out because it made the logs almost impossible to read
            
            #if self.DEBUG:
            #    print "Processing Message:\n\t{0}".format(response['command'])
            if response['command'] == 'url_dispatch':
                if response['destination_id'] == self.uid:
                    #print "URL Dispatch Command Seen."
                    #print response
                    if self.scraping == False:
                        #print "[Wrapper] Launching Scraper on URL: '{0}'".format(response['message']['targeturl'])
                        self.scraper.set_url_data(response['message'])
                        #if self.scraper.started == False:
                        #    self.scraper.start()
                        if self.DEBUG:
                            print "Launching scraper thread ..."
                        self.scraping = True
                        self.scraper_thread = threading.Thread(target=self._scraperstart)
                        self.scraper_thread.start()
                        #self._scraperstart()
                        if self.DEBUG:
                            print " ... Scraper launched successfully."

            elif response['command'] == 'scraper_finished':
                if response['source_id'] == self.scraper.uid:
                    self.scraping = False

            elif response['command'] == 'get_status':
                self.broadcaststatus()

            elif response['command'] == 'get_status_simple':
                self.broadcastsimplestatus()

            elif response['command'] == 'reset_scraper':
                if response['destination_id'] == self.uid:
                    self.resetscraper()

            elif response['command'] == 'shutdown':
                if response['destination_id'] == self.uid:
                    print "[{0}] Shutting Down Recieved".format(self.uid)
                    self.stop()

            elif response['command'] == 'global_shutdown':
                print "Global Shutdown Recieved"
                self.stop()
Developer ID: reustonium, Project: BarkingOwl, Lines of code: 104, Source: scraperwrapper.py

Example 4: Scraper

# Required import: from scraper import Scraper [as alias]
# Or: from scraper.Scraper import set_url_data [as alias]

#......... some of the code is omitted here .........
            'allowed_domains': [
            ],
            'sleep_time': 0, # do not sleep between URL fetches
        }

        #try:
        if True:
            scraper = Scraper(
                check_type=options.tracking_method,
                check_type_uri=options.uri,
                DEBUG=_DEBUG,
            )
            scraper.set_callbacks(
                found_doc_callback = handle_doc,
            )
            scraper.set_url_data(url)

            if _DEBUG == True:
                print "\nStarting Scraper on {0} ...\n\n".format(options.target_url)
            data = scraper.start()
            if _DEBUG == True:
                print "\n\nScraper complete.\n"

            if _DEBUG == True:
                print "BarkingOwl Scraper found {0} documents on {1}.\n\n".format(
                    len(data['documents']),
                    options.target_url,
                )

            if options.json_output == True:
                data = scraper._data
Developer ID: thequbit, Project: BarkingOwl, Lines of code: 33, Source: cli-scraper.py

Example 5: ScraperWrapper

# Required import: from scraper import Scraper [as alias]
# Or: from scraper.Scraper import set_url_data [as alias]

#......... some of the code is omitted here .........
            '''

            if not self.stopped:
                #threading.Timer(self.interval, self.broadcast_available).start()
                self.bus_access.tsleep(self.broadcast_interval)

    def broadcast_simple_status(self):
        if self.scraper._data['url_data'] == {}:
            targeturl = None
        else:
            targeturl = self.scraper._data['url_data']['target_url']
        packet = {
            'working': self.scraper._data['working'],
            'url_count': len(self.scraper._data['seen_urls']),
            'bad_url_count': len(self.scraper._data['bad_urls']),
            'target_url': targeturl,
            'status_datetime': str(datetime.datetime.now())
        }
        self.bus_access.send_message(
            command = 'scraper_available',
            destination_id = 'broadcast',
            message = packet,
        )

    def scraper_finished_callback(self, _data):
        self.bus_access.send_message(
            command = 'scraper_finished',
            destination_id = 'broadcast',
            message = _data,
        )

    def scraper_started_callback(self, _data):
        self.bus_access.send_message(
            command = 'scraper_started',
            destination_id = 'broadcast',
            message = _data,
        )

    def scraper_broadcast_document_callback(self, _data, document):
        self.bus_access.send_message(
            command = 'scraper_found_document',
            destination_id = 'broadcast',
            message = {
                'url_data': _data['url_data'],
                'document': document,
            },
        )

    def _scraperstart(self):
        if self._DEBUG == True:
            print "ScraperWrapper()._scraperstart(): Starting scraper ..."
        documents = self.scraper.start()
        if self._DEBUG == True:
            print "ScraperWrapper()._scraperstart(): Scraper complete."
            #print documents
        self.scraping = False

    def _reqcallback(self,payload): #ch,method,properties,body):
        try:
            response = payload
           
            if self._DEBUG == True:
                print "ScraperWrapper()._reqcallback(): new message: {0}".format(response)
 
            if response['command'] == 'url_dispatch':
                if response['destination_id'] == self.uid:
                    if self.scraping == False:
                        self.scraper.set_url_data(response['message'])
                        #log( "ScraperWrapper._reqcallback(): Launching scraper thread ...", self.DEBUG )
                        self.scraping = True
                        self.scraper_thread = threading.Thread(target=self._scraperstart)
                        self.scraper_thread.start()
                        #self._scraperstart()
                        #log( "ScraperWrapper._reqcallback(): ... Scraper launched successfully.", self.DEBUG )

            #elif response['command'] == 'scraper_finished':
            #    if response['source_id'] == self.scraper.uid:
            #        self.scraping = False

            elif response['command'] == 'get_status':
                self.broadcast_status()

            elif response['command'] == 'get_status_simple':
                self.broadcast_simple_status()

            elif response['command'] == 'reset_scraper':
                if response['destination_id'] == self.uid:
                    self.resetscraper()

            elif response['command'] == 'shutdown':
                if response['destination_id'] == self.uid:
                    #log( "ScraperWrapper._reqcallback(): [{0}] Shutting Down Recieved".format(self.uid), self.DEBUG ) 
                    self.stop()

            elif response['command'] == 'global_shutdown':
                #log( "ScraperWrapper._reqcallback(): Global Shutdown Recieved", self.DEBUG )
                self.stop()
        except Exception, e:
            print "ScraperWrapper._reqcallback(): ERROR: {0}".format(str(e))
            print traceback.format_exc()
Developer ID: thequbit, Project: BarkingOwl, Lines of code: 104, Source: scraperwrapper.py


Note: The scraper.Scraper.set_url_data method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by various developers; copyright of the source code remains with the original authors. Please follow the corresponding projects' licenses when distributing or using the code, and do not reproduce this article without permission.