

Python Scraper.find_docs Method Code Examples

This article collects typical code examples of the Python method scraper.Scraper.find_docs. If you are wondering what Scraper.find_docs does, how to call it, or what real-world usage looks like, the curated examples below should help. You can also explore further usage examples of the containing class, scraper.Scraper.


The following presents 3 code examples of Scraper.find_docs, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.

Example 1: test_find_docs

# Required import: from scraper import Scraper [as alias]
# Or alternatively: from scraper.Scraper import find_docs [as alias]
import datetime
import json
import uuid

from scraper import Scraper

def test_find_docs():

    declare_test_start('find_docs')

    url_data = {
        'url_id': 1,
        'target_url': 'http://timduffy.me/',
        'max_link_level': 6,
        'creation_date_time': str(datetime.datetime.now()),
        'doc_type': 'application/pdf',
        'dispatch_datetime': str(datetime.datetime.now()),
        'allowed_domains': [],
    }

    uid = str(uuid.uuid4())
    scraper = Scraper(uid)
    scraper.set_url_data(url_data)
    docs = scraper.find_docs()

    print('[ TEST ] {0}'.format(json.dumps(scraper.status)))
    print('[ TEST ] {0}'.format(json.dumps(docs)))

    passed = len(docs) > 0

    declare_test_end(passed)
Developer ID: reustonium, Project: BarkingOwl, Lines of code: 29, Source file: tests.py
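
Note that declare_test_start and declare_test_end are test-harness helpers defined elsewhere in the project's tests.py; they are not part of the Scraper API. A minimal hypothetical sketch of them, purely an assumption so the example can run standalone:

def declare_test_start(name):
    # Hypothetical helper: print a banner marking the start of a named test.
    print('[ TEST ] ---- start: {0} ----'.format(name))

def declare_test_end(passed):
    # Hypothetical helper: report whether the test passed.
    print('[ TEST ] ---- end: {0} ----'.format('PASSED' if passed else 'FAILED'))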

Example 2: test_find_all_docs

# Required import: from scraper import Scraper [as alias]
# Or alternatively: from scraper.Scraper import find_docs [as alias]
def test_find_all_docs(url):

    declare_test_start('find_all_docs')

    url_data = {
        'url_id': 1,
        'target_url': url, # 'http://www.scottsvilleny.org/',
        'max_link_level': -1,  # presumably: no depth limit when negative
        'creation_date_time': str(datetime.datetime.now()),
        'doc_type': 'application/pdf',
        'dispatch_datetime': str(datetime.datetime.now()),
        'allowed_domains': [],
    }

    uid = str(uuid.uuid4())
    scraper = Scraper(uid)
    scraper.set_url_data(url_data)
    docs = scraper.find_docs()
    status = scraper.status
    #print('[ TEST ] {0}'.format(json.dumps(scraper.status)))
    #print('[ TEST ] {0}'.format(json.dumps(docs)))

    with open('find_docs_external_results.json', 'w') as f:
        f.write(json.dumps(status))

    with open('find_docs_external_all_docs.json', 'w') as f:
        f.write(json.dumps(docs))

    passed = len(docs) > 0

    declare_test_end(passed)

    return docs, status
Developer ID: reustonium, Project: BarkingOwl, Lines of code: 38, Source file: tests.py
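
A quick usage sketch for this example (the URL is a placeholder; any site that links to PDF documents would do). The function returns the collected documents and the scraper's status dict, and also writes both to the JSON files named above:

# Hypothetical invocation of the test above.
docs, status = test_find_all_docs('http://www.scottsvilleny.org/')
print('{0} documents found'.format(len(docs)))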

Example 3: ScraperWrapper

# Required import: from scraper import Scraper [as alias]
# Or alternatively: from scraper.Scraper import find_docs [as alias]

#......... part of the code omitted here .........
            targeturl = self.scraper.status['url_data']['target_url']

        packet = {
            'busy': self.scraper.status['busy'],
            'link_count': self.scraper.status['link_count'],
            'bad_link_count': len(self.scraper.status['bad_links']),
            'target_url': targeturl,
            'status_datetime': str(datetime.datetime.now())
        }
        payload = {
            'command': 'scraper_status_simple',
            'source_id': self.uid,
            'destination_id': 'broadcast',
            'message': packet
        }
        jbody = json.dumps(payload)
        self.respchan.basic_publish(exchange=self.exchange, routing_key='', body=jbody)

    def scraper_finished_callback(self, payload):
        """
        scraper_finished_callback() is the built-in, default async callback for when the 'scraper_finished' command is seen.
        """
        jbody = json.dumps(payload)
        self.respchan.basic_publish(exchange=self.exchange, routing_key='', body=jbody)
        return

    def scraper_started_callback(self, payload):
        """
        scraper_started_callback() is the built-in, default async callback for when the 'scraper started' command is seen.
        """
        jbody = json.dumps(payload)
        self.respchan.basic_publish(exchange=self.exchange, routing_key='', body=jbody)
        return

    def scraper_broadcast_document_callback(self, payload):
        """
        scraper_broadcast_document_callback() is the built-in, default async callback for when the scraper broadcasts a newly found document.
        """
        jbody = json.dumps(payload)
        self.respchan.basic_publish(exchange=self.exchange, routing_key='', body=jbody)
        return

    def _scraperstart(self):
        # Thread target: run the scraper's crawl for the configured URL data.
        self.scraper.find_docs()

    # message handler
    def _reqcallback(self, ch, method, properties, body):
        response = json.loads(body)

        # Per-message debug logging is left disabled; it made the logs
        # almost impossible to read.

        if response['command'] == 'url_dispatch':
            if response['destination_id'] == self.uid:
                if not self.scraping:
                    # Launch the scraper against the dispatched URL in a
                    # worker thread so the message loop stays responsive.
                    self.scraper.set_url_data(response['message'])
                    if self.DEBUG:
                        print("Launching scraper thread ...")
                    self.scraping = True
                    self.scraper_thread = threading.Thread(target=self._scraperstart)
                    self.scraper_thread.start()
                    if self.DEBUG:
                        print(" ... Scraper launched successfully.")

        elif response['command'] == 'scraper_finished':
            if response['source_id'] == self.scraper.uid:
                self.scraping = False

        elif response['command'] == 'get_status':
            self.broadcaststatus()

        elif response['command'] == 'get_status_simple':
            self.broadcastsimplestatus()

        elif response['command'] == 'reset_scraper':
            if response['destination_id'] == self.uid:
                self.resetscraper()

        elif response['command'] == 'shutdown':
            if response['destination_id'] == self.uid:
                print("[{0}] Shutdown Received".format(self.uid))
                self.stop()

        elif response['command'] == 'global_shutdown':
            print("Global Shutdown Received")
            self.stop()
Developer ID: reustonium, Project: BarkingOwl, Lines of code: 104, Source file: scraperwrapper.py
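
The wrapper above is driven entirely by JSON messages arriving on a broadcast exchange: _reqcallback() dispatches on response['command'], and a 'url_dispatch' message addressed to this wrapper's uid starts find_docs() in a worker thread. Below is a hedged sketch of how a dispatcher might publish such a message with pika; the exchange name 'barkingowl', the fanout type, the localhost connection, and the '<wrapper-uid>' placeholder are assumptions, not confirmed by this listing:

import datetime
import json

import pika

EXCHANGE = 'barkingowl'  # assumed name; the listing only references self.exchange

connection = pika.BlockingConnection(pika.ConnectionParameters('localhost'))
channel = connection.channel()
channel.exchange_declare(exchange=EXCHANGE, exchange_type='fanout')

# Mirror the payload shape consumed by _reqcallback() above.
payload = {
    'command': 'url_dispatch',
    'source_id': 'dispatcher',
    'destination_id': '<wrapper-uid>',  # uid of the target ScraperWrapper
    'message': {
        'url_id': 1,
        'target_url': 'http://timduffy.me/',
        'max_link_level': 6,
        'creation_date_time': str(datetime.datetime.now()),
        'doc_type': 'application/pdf',
        'dispatch_datetime': str(datetime.datetime.now()),
        'allowed_domains': [],
    },
}
channel.basic_publish(exchange=EXCHANGE, routing_key='', body=json.dumps(payload))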


Note: the scraper.Scraper.find_docs examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub/MSDocs. The code snippets are drawn from open-source projects contributed by their authors; copyright remains with the original authors, and distribution and use are subject to the corresponding project licenses. Do not reproduce without permission.