This page collects typical code examples of the Python method scraper.Scraper.find_docs. If you have been wondering how Scraper.find_docs works, how to call it, or what real-world usage looks like, the hand-picked examples below should help. You can also browse further usage examples for the enclosing class, scraper.Scraper.
Three code examples of the Scraper.find_docs method are shown below, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the system recommend better Python examples.
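All three examples share the same basic call pattern: create a Scraper with a unique id, hand it a url_data dictionary via set_url_data, then call find_docs to collect matching documents. The following minimal sketch is distilled from the examples on this page; the field values are placeholders, and the per-field comments are inferences rather than documented behaviour.

    # Minimal call pattern, distilled from the examples below; values are placeholders.
    import datetime
    import uuid

    from scraper import Scraper

    url_data = {
        'url_id': 1,
        'target_url': 'http://example.com/',              # site to scan for documents
        'max_link_level': 2,                               # link-follow depth (-1 appears to mean unlimited)
        'creation_date_time': str(datetime.datetime.now()),
        'doc_type': 'application/pdf',                     # MIME type of the documents to collect
        'dispatch_datetime': str(datetime.datetime.now()),
        'allowed_domains': [],                             # empty in these examples, i.e. no domain restriction
    }

    scraper = Scraper(str(uuid.uuid4()))   # each scraper instance is identified by a uuid
    scraper.set_url_data(url_data)
    docs = scraper.find_docs()             # expected to return a JSON-serialisable list of found documents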
Example 1: test_find_docs
# Required import: from scraper import Scraper [as alias]
# Or: from scraper.Scraper import find_docs [as alias]
# (the snippets below also use the standard-library modules datetime, json and uuid,
#  plus declare_test_start/declare_test_end helpers that appear to come from the surrounding test harness)
def test_find_docs():
    declare_test_start('follow_link')
    url_data = {
        'url_id': 1,
        'target_url': 'http://timduffy.me/',
        'max_link_level': 6,
        'creation_date_time': str(datetime.datetime.now()),
        'doc_type': 'application/pdf',
        'dispatch_datetime': str(datetime.datetime.now()),
        'allowed_domains': [],
    }
    uid = str(uuid.uuid4())
    scraper = Scraper(uid)
    scraper.set_url_data(url_data)
    docs = scraper.find_docs()
    print '[ TEST ] {0}'.format(json.dumps(scraper.status))
    print '[ TEST ] {0}'.format(json.dumps(docs))
    passed = False
    if len(docs) > 0:
        passed = True
    declare_test_end(passed)
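Example 1 (and example 2 below) dumps scraper.status with json.dumps, while example 3 reads individual fields out of it. From those accesses alone, the status dictionary appears to contain at least the following keys; this is a reconstruction from the examples, not a documented schema.

    # Keys of Scraper.status that the examples on this page actually read;
    # inferred from the code, not from Scraper's own documentation.
    status_shape = {
        'busy': False,        # whether the scraper is currently working (read by example 3)
        'link_count': 0,      # number of links followed so far
        'bad_links': [],      # failed links; example 3 reports len(status['bad_links'])
        'url_data': {},       # the dictionary previously passed to set_url_data()
    }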
Example 2: test_find_all_docs
# Required import: from scraper import Scraper [as alias]
# Or: from scraper.Scraper import find_docs [as alias]
def test_find_all_docs(url):
    declare_test_start('follow_link')
    url_data = {
        'url_id': 1,
        'target_url': url,  # 'http://www.scottsvilleny.org/'
        'max_link_level': -1,
        'creation_date_time': str(datetime.datetime.now()),
        'doc_type': 'application/pdf',
        'dispatch_datetime': str(datetime.datetime.now()),
        'allowed_domains': [],
    }
    uid = str(uuid.uuid4())
    scraper = Scraper(uid)
    scraper.set_url_data(url_data)
    docs = scraper.find_docs()
    status = scraper.status
    #print '[ TEST ] {0}'.format(json.dumps(scraper.status))
    #print '[ TEST ] {0}'.format(json.dumps(docs))
    with open('find_docs_external_results.json', 'w') as f:
        f.write(json.dumps(status))
    with open('find_docs_external_all_docs.json', 'w') as f:
        f.write(json.dumps(docs))
    passed = False
    if len(docs) > 0:
        passed = True
    declare_test_end(passed)
    return docs, status
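test_find_all_docs both returns its results and persists them to two JSON files, so a run can be inspected later without scraping again. A small usage sketch, assuming the function above is importable and the target site is reachable:

    import json

    docs, status = test_find_all_docs('http://timduffy.me/')

    # the same data was written to disk by the test itself:
    with open('find_docs_external_all_docs.json') as f:
        saved_docs = json.load(f)
    print '[ TEST ] reloaded {0} documents'.format(len(saved_docs))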
Example 3: ScraperWrapper
# Required import: from scraper import Scraper [as alias]
# Or: from scraper.Scraper import find_docs [as alias]
#......... part of the code is omitted here .........
        targeturl = self.scraper.status['url_data']['target_url']
        packet = {
            'busy': self.scraper.status['busy'],
            'link_count': self.scraper.status['link_count'],
            'bad_link_count': len(self.scraper.status['bad_links']),
            'target_url': targeturl,
            'status_datetime': str(datetime.datetime.now())
        }
        payload = {
            'command': 'scraper_status_simple',
            'source_id': self.uid,
            'destination_id': 'broadcast',
            'message': packet
        }
        jbody = json.dumps(payload)
        self.respchan.basic_publish(exchange=self.exchange, routing_key='', body=jbody)
    def scraper_finished_callback(self, payload):
        """
        scraper_finished_callback() is the built-in, default async callback invoked when the 'scraper finished' command is seen.
        """
        jbody = json.dumps(payload)
        self.respchan.basic_publish(exchange=self.exchange, routing_key='', body=jbody)
        return

    def scraper_started_callback(self, payload):
        """
        scraper_started_callback() is the built-in, default async callback invoked when the 'scraper started' command is seen.
        """
        jbody = json.dumps(payload)
        self.respchan.basic_publish(exchange=self.exchange, routing_key='', body=jbody)
        return

    def scraper_broadcast_document_callback(self, payload):
        """
        scraper_broadcast_document_callback() is the built-in, default async callback invoked when the scraper finds a new document.
        """
        jbody = json.dumps(payload)
        self.respchan.basic_publish(exchange=self.exchange, routing_key='', body=jbody)
        return
    def _scraperstart(self):
        #if self.scraper.start == False:
        #    self.scraper.start()
        #self.scraper.begin()
        self.scraper.find_docs()

    # message handler
    def _reqcallback(self, ch, method, properties, body):
        #try:
        if True:
            response = json.loads(body)
            # commented this out because it made the logs almost impossible to read
            #if self.DEBUG:
            #    print "Processing Message:\n\t{0}".format(response['command'])
            if response['command'] == 'url_dispatch':
                if response['destination_id'] == self.uid:
                    #print "URL Dispatch Command Seen."
                    #print response
                    if self.scraping == False:
                        #print "[Wrapper] Launching Scraper on URL: '{0}'".format(response['message']['targeturl'])
                        self.scraper.set_url_data(response['message'])
                        #if self.scraper.started == False:
                        #    self.scraper.start()
                        if self.DEBUG:
                            print "Launching scraper thread ..."
                        self.scraping = True
                        self.scraper_thread = threading.Thread(target=self._scraperstart)
                        self.scraper_thread.start()
                        #self._scraperstart()
                        if self.DEBUG:
                            print " ... Scraper launched successfully."
            elif response['command'] == 'scraper_finished':
                if response['source_id'] == self.scraper.uid:
                    self.scraping = False
            elif response['command'] == 'get_status':
                self.broadcaststatus()
            elif response['command'] == 'get_status_simple':
                self.broadcastsimplestatus()
            elif response['command'] == 'reset_scraper':
                if response['destination_id'] == self.uid:
                    self.resetscraper()
            elif response['command'] == 'shutdown':
                if response['destination_id'] == self.uid:
                    print "[{0}] Shutdown Received".format(self.uid)
                    self.stop()
            elif response['command'] == 'global_shutdown':
                print "Global Shutdown Received"
                self.stop()
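The _reqcallback handler only ever reads a handful of fields from each incoming message: 'command', 'source_id', 'destination_id' and, for url_dispatch, a 'message' payload that is handed straight to Scraper.set_url_data. The sketch below reconstructs a url_dispatch message from those accesses and from the url_data dictionaries in examples 1 and 2; it is an inference, not a documented protocol.

    # Shape of a 'url_dispatch' message, inferred from the fields _reqcallback reads.
    import datetime
    import json

    url_dispatch = {
        'command': 'url_dispatch',
        'source_id': 'dispatcher-uuid-placeholder',   # hypothetical sender id
        'destination_id': 'scraper-wrapper-uuid',     # must equal the wrapper's self.uid to be acted on
        'message': {                                  # forwarded unchanged to Scraper.set_url_data()
            'url_id': 1,
            'target_url': 'http://example.com/',
            'max_link_level': 2,
            'creation_date_time': str(datetime.datetime.now()),
            'doc_type': 'application/pdf',
            'dispatch_datetime': str(datetime.datetime.now()),
            'allowed_domains': [],
        },
    }
    body = json.dumps(url_dispatch)   # this JSON string is what _reqcallback receives as `body`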