This article collects typical usage examples of the Python method scraper.Scraper.set_url_data. If you are unsure what Scraper.set_url_data does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples for the class it belongs to, scraper.Scraper.
Five code examples of the Scraper.set_url_data method are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
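Before the full examples, here is a minimal sketch of the basic call pattern, following Example 1 below. The url_data field names are taken from the examples; the target URL and values are placeholders, and the Scraper(uid) constructor and find_docs() call mirror Example 1 rather than the keyword-argument constructor used in Example 4.

import datetime
import uuid

from scraper import Scraper

# Field names copied from the url_data dictionaries in Examples 1 and 2;
# the target URL here is only a placeholder.
url_data = {
    'url_id': 1,
    'target_url': 'http://example.com/',
    'max_link_level': 2,
    'creation_date_time': str(datetime.datetime.now()),
    'doc_type': 'application/pdf',
    'dispatch_datetime': str(datetime.datetime.now()),
    'allowed_domains': [],
}

scraper = Scraper(str(uuid.uuid4()))   # unique id per scraper, as in Example 1
scraper.set_url_data(url_data)         # hand the scraper its work item
docs = scraper.find_docs()             # collect matching documents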
Example 1: test_find_docs
# Required import: from scraper import Scraper [as alias]
# Or: from scraper.Scraper import set_url_data [as alias]
def test_find_docs():
    declare_test_start( 'follow_link' )

    url_data = {
        'url_id': 1,
        'target_url': 'http://timduffy.me/',
        'max_link_level': 6,
        'creation_date_time': str(datetime.datetime.now()),
        'doc_type': 'application/pdf',
        'dispatch_datetime': str(datetime.datetime.now()),
        'allowed_domains': [],
    }

    uid = str(uuid.uuid4())
    scraper = Scraper(uid)
    scraper.set_url_data(url_data)
    docs = scraper.find_docs( )

    print '[ TEST ] {0}'.format(json.dumps(scraper.status))
    print '[ TEST ] {0}'.format(json.dumps(docs))

    passed = False
    if len(docs) > 0:
        passed = True

    declare_test_end( passed )
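Examples 1 and 2 call declare_test_start() and declare_test_end(), which are not part of the excerpt. A minimal stand-in, assuming they only print pass/fail banners, could look like the following (hypothetical helpers, not the project's actual test harness):

def declare_test_start(name):
    # Hypothetical stand-in: announce which test is starting.
    print '[ TEST ] starting: {0}'.format(name)

def declare_test_end(passed):
    # Hypothetical stand-in: report the test result.
    print '[ TEST ] result: {0}'.format('PASSED' if passed else 'FAILED')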
Example 2: test_find_all_docs
# Required import: from scraper import Scraper [as alias]
# Or: from scraper.Scraper import set_url_data [as alias]
def test_find_all_docs(url):
    declare_test_start( 'follow_link' )

    url_data = {
        'url_id': 1,
        'target_url': url, # 'http://www.scottsvilleny.org/',
        'max_link_level': -1,
        'creation_date_time': str(datetime.datetime.now()),
        'doc_type': 'application/pdf',
        'dispatch_datetime': str(datetime.datetime.now()),
        'allowed_domains': [],
    }

    uid = str(uuid.uuid4())
    scraper = Scraper(uid)
    scraper.set_url_data(url_data)
    docs = scraper.find_docs( )
    status = scraper.status

    #print '[ TEST ] {0}'.format(json.dumps(scraper.status))
    #print '[ TEST ] {0}'.format(json.dumps(docs))

    with open('find_docs_external_results.json','w') as f:
        f.write(json.dumps(status))

    with open('find_docs_external_all_docs.json', 'w') as f:
        f.write(json.dumps(docs))

    passed = False
    if len(docs) > 0:
        passed = True

    declare_test_end( passed )

    return docs, status
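A call site for this test might simply pass in a site root and inspect the returned document list, for example (the URL is the one mentioned in the comment above):

docs, status = test_find_all_docs('http://www.scottsvilleny.org/')
print '[ TEST ] found {0} documents'.format(len(docs))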
Example 3: ScraperWrapper
# Required import: from scraper import Scraper [as alias]
# Or: from scraper.Scraper import set_url_data [as alias]
#......... part of the code omitted here .........
    targeturl = self.scraper.status['url_data']['target_url']

    packet = {
        'busy': self.scraper.status['busy'],
        'link_count': self.scraper.status['link_count'],
        'bad_link_count': len(self.scraper.status['bad_links']),
        'target_url': targeturl,
        'status_datetime': str(datetime.datetime.now())
    }
    payload = {
        'command': 'scraper_status_simple',
        'source_id': self.uid,
        'destination_id': 'broadcast',
        'message': packet
    }
    jbody = json.dumps(payload)
    self.respchan.basic_publish(exchange=self.exchange, routing_key='', body=jbody)

def scraper_finished_callback(self, payload):
    """
    scraperFinishedCallBack() is the built-in, default async callback for when the 'scraper finished' command is seen.
    """
    jbody = json.dumps(payload)
    self.respchan.basic_publish(exchange=self.exchange, routing_key='', body=jbody)
    return

def scraper_started_callback(self, payload):
    """
    scraperStartedCallBack() is the built-in, default async callback for when the 'scraper started' command is seen.
    """
    jbody = json.dumps(payload)
    self.respchan.basic_publish(exchange=self.exchange, routing_key='', body=jbody)
    return

def scraper_broadcast_document_callback(self, payload):
    """
    scraperBroadcastDocCallBack() is the built-in, default async callback for when the scraper finds a new document.
    """
    jbody = json.dumps(payload)
    self.respchan.basic_publish(exchange=self.exchange, routing_key='', body=jbody)
    return

def _scraperstart(self):
    #if self.scraper.start == False:
    #    self.scraper.start()
    #self.scraper.begin()
    self.scraper.find_docs()

# message handler
def _reqcallback(self, ch, method, properties, body):
    #try:
    if True:
        response = json.loads(body)

        # commented this out because it made the logs almost impossible to read
        #if self.DEBUG:
        #    print "Processing Message:\n\t{0}".format(response['command'])

        if response['command'] == 'url_dispatch':
            if response['destination_id'] == self.uid:
                #print "URL Dispatch Command Seen."
                #print response
                if self.scraping == False:
                    #print "[Wrapper] Launching Scraper on URL: '{0}'".format(response['message']['targeturl'])
                    self.scraper.set_url_data(response['message'])
                    #if self.scraper.started == False:
                    #    self.scraper.start()
                    if self.DEBUG:
                        print "Launching scraper thread ..."
                    self.scraping = True
                    self.scraper_thread = threading.Thread(target=self._scraperstart)
                    self.scraper_thread.start()
                    #self._scraperstart()
                    if self.DEBUG:
                        print " ... Scraper launched successfully."

        elif response['command'] == 'scraper_finished':
            if response['source_id'] == self.scraper.uid:
                self.scraping = False

        elif response['command'] == 'get_status':
            self.broadcaststatus()

        elif response['command'] == 'get_status_simple':
            self.broadcastsimplestatus()

        elif response['command'] == 'reset_scraper':
            if response['destination_id'] == self.uid:
                self.resetscraper()

        elif response['command'] == 'shutdown':
            if response['destination_id'] == self.uid:
                print "[{0}] Shutdown Received".format(self.uid)
                self.stop()

        elif response['command'] == 'global_shutdown':
            print "Global Shutdown Received"
            self.stop()
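The handler above is driven entirely by JSON messages keyed on 'command' and 'destination_id'. A url_dispatch payload that would start this wrapper's scraper might look like the sketch below; the field names come from the handler and from the url_data dictionaries in Examples 1 and 2, while the actual queue publishing is assumed and not shown:

import datetime
import json
import uuid

wrapper_uid = str(uuid.uuid4())   # in practice this must match the target wrapper's self.uid

dispatch = {
    'command': 'url_dispatch',
    'destination_id': wrapper_uid,
    'message': {
        'url_id': 1,
        'target_url': 'http://timduffy.me/',
        'max_link_level': 6,
        'creation_date_time': str(datetime.datetime.now()),
        'doc_type': 'application/pdf',
        'dispatch_datetime': str(datetime.datetime.now()),
        'allowed_domains': [],
    },
}

# This JSON string is what would be published on the exchange the wrapper listens to.
body = json.dumps(dispatch)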
Example 4: Scraper
# Required import: from scraper import Scraper [as alias]
# Or: from scraper.Scraper import set_url_data [as alias]
    'allowed_domains': [
    ],
    'sleep_time': 0, # do not sleep between URL fetches
}

#try:
if True:
    scraper = Scraper(
        check_type=options.tracking_method,
        check_type_uri=options.uri,
        DEBUG=_DEBUG,
    )

    scraper.set_callbacks(
        found_doc_callback = handle_doc,
    )

    scraper.set_url_data(url)

    if _DEBUG == True:
        print "\nStarting Scraper on {0} ...\n\n".format(options.target_url)

    data = scraper.start()

    if _DEBUG == True:
        print "\n\nScraper complete.\n"

    if _DEBUG == True:
        print "BarkingOwl Scraper found {0} documents on {1}.\n\n".format(
            len(data['documents']),
            options.target_url,
        )

    if options.json_output == True:
        data = scraper._data
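This example registers a found_doc_callback named handle_doc that is not shown in the excerpt, and its exact signature is not visible here. A hypothetical placeholder that accepts whatever the scraper passes and prints it for inspection could look like:

def handle_doc(*args, **kwargs):
    # Hypothetical placeholder; the real handle_doc is not part of the excerpt.
    print "[ DOC ] args={0} kwargs={1}".format(args, kwargs)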
Example 5: ScraperWrapper
# Required import: from scraper import Scraper [as alias]
# Or: from scraper.Scraper import set_url_data [as alias]
#......... part of the code omitted here .........
    '''
    if not self.stopped:
        #threading.Timer(self.interval, self.broadcast_available).start()
        self.bus_access.tsleep(self.broadcast_interval)

def broadcast_simple_status(self):
    if self.scraper._data['url_data'] == {}:
        targeturl = None
    else:
        targeturl = self.scraper._data['url_data']['target_url']

    packet = {
        'working': self.scraper._data['working'],
        'url_count': len(self.scraper._data['seen_urls']),
        'bad_url_count': len(self.scraper._data['bad_urls']),
        'target_url': targeturl,
        'status_datetime': str(datetime.datetime.now())
    }

    self.bus_access.send_message(
        command = 'scraper_available',
        destination_id = 'broadcast',
        message = packet,
    )

def scraper_finished_callback(self, _data):
    self.bus_access.send_message(
        command = 'scraper_finished',
        destination_id = 'broadcast',
        message = _data,
    )

def scraper_started_callback(self, _data):
    self.bus_access.send_message(
        command = 'scraper_started',
        destination_id = 'broadcast',
        message = _data,
    )

def scraper_broadcast_document_callback(self, _data, document):
    self.bus_access.send_message(
        command = 'scraper_found_document',
        destination_id = 'broadcast',
        message = {
            'url_data': _data['url_data'],
            'document': document,
        },
    )

def _scraperstart(self):
    if self._DEBUG == True:
        print "ScraperWrapper()._scraperstart(): Starting scraper ..."

    documents = self.scraper.start()

    if self._DEBUG == True:
        print "ScraperWrapper()._scraperstart(): Scraper complete."
        #print documents

    self.scraping = False

def _reqcallback(self, payload): #ch,method,properties,body):
    try:
        response = payload

        if self._DEBUG == True:
            print "ScraperWrapper()._reqcallback(): new message: {0}".format(response)

        if response['command'] == 'url_dispatch':
            if response['destination_id'] == self.uid:
                if self.scraping == False:
                    self.scraper.set_url_data(response['message'])
                    #log( "ScraperWrapper._reqcallback(): Launching scraper thread ...", self.DEBUG )
                    self.scraping = True
                    self.scraper_thread = threading.Thread(target=self._scraperstart)
                    self.scraper_thread.start()
                    #self._scraperstart()
                    #log( "ScraperWrapper._reqcallback(): ... Scraper launched successfully.", self.DEBUG )

        #elif response['command'] == 'scraper_finished':
        #    if response['source_id'] == self.scraper.uid:
        #        self.scraping = False

        elif response['command'] == 'get_status':
            self.broadcast_status()

        elif response['command'] == 'get_status_simple':
            self.broadcast_simple_status()

        elif response['command'] == 'reset_scraper':
            if response['destination_id'] == self.uid:
                self.resetscraper()

        elif response['command'] == 'shutdown':
            if response['destination_id'] == self.uid:
                #log( "ScraperWrapper._reqcallback(): [{0}] Shutdown Received".format(self.uid), self.DEBUG )
                self.stop()

        elif response['command'] == 'global_shutdown':
            #log( "ScraperWrapper._reqcallback(): Global Shutdown Received", self.DEBUG )
            self.stop()

    except Exception, e:
        print "ScraperWrapper._reqcallback(): ERROR: {0}".format(str(e))
        print traceback.format_exc()