本文整理汇总了Python中DIRAC.WorkloadManagementSystem.DB.JobLoggingDB.JobLoggingDB类的典型用法代码示例。如果您正苦于以下问题:Python JobLoggingDB类的具体用法?Python JobLoggingDB怎么用?Python JobLoggingDB使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了JobLoggingDB类的14个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
def __init__(self, pilotAgentsDB=None, jobDB=None, tqDB=None, jlDB=None, opsHelper=None):
    """ Wire up the collaborators used for matching.

        Every argument is optional: a collaborator that is missing (or falsy)
        is replaced by a freshly constructed default object.

        :param pilotAgentsDB: replacement for the default PilotAgentsDB()
        :param jobDB: replacement for the default JobDB()
        :param tqDB: replacement for the default TaskQueueDB()
        :param jlDB: replacement for the default JobLoggingDB()
        :param opsHelper: replacement for the default Operations()
    """
    # `or` keeps the truthiness test: any falsy argument selects the default
    self.pilotAgentsDB = pilotAgentsDB or PilotAgentsDB()
    self.jobDB = jobDB or JobDB()
    self.tqDB = tqDB or TaskQueueDB()
    self.jlDB = jlDB or JobLoggingDB()
    self.opsHelper = opsHelper or Operations()

    self.log = gLogger.getSubLogger("Matcher")
    # The limiter shares the job DB and operations helper selected above
    self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper)
示例2: initialize
def initialize(self):
    """ Set the agent defaults and load its configuration options.

        Creates the JobDB, TaskQueueDB and JobLoggingDB handles, resolves the
        production types excluded from cleaning and reads the MaxJobsAtOnce,
        JobByJob, ThrottlingPeriod and RemoveStatusDelay/* options.

        :return: S_OK()
    """
    self.am_setOption("PollingTime", 120)

    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )

    # The agent option wins; the Operations value is only the fallback
    productionTypes = self.am_getOption('ProductionTypes', [])
    if not productionTypes:
        productionTypes = Operations().getValue('Transformations/DataProcessing', ['MCSimulation', 'Merge'])
    self.prod_types = productionTypes
    excluded = ', '.join(self.prod_types)
    gLogger.info("Will exclude the following Production types from cleaning %s" % excluded)

    self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce', 500)
    self.jobByJob = self.am_getOption('JobByJob', False)
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)

    # (status, option name, default value) for each removal delay
    delayOptions = (
        ('Done', 'RemoveStatusDelay/Done', 7),
        ('Killed', 'RemoveStatusDelay/Killed', 7),
        ('Failed', 'RemoveStatusDelay/Failed', 7),
        ('Any', 'RemoveStatusDelay/Any', -1),
    )
    for status, optionName, default in delayOptions:
        self.removeStatusDelay[status] = self.am_getOption(optionName, default)

    return S_OK()
示例3: initialize
def initialize(self, jobDB=False, logDB=False):
    """ Initialize the Optimizer Agent.

        Stores the DB handles (building default ones for falsy arguments),
        derives the optimizer name from the agent name by dropping a trailing
        "Agent", and sets the starting statuses and default options.

        :param jobDB: job DB object; a JobDB() is created when falsy
        :param logDB: logging DB object; a JobLoggingDB() is created when falsy
        :return: whatever self.initializeOptimizer() returns
    """
    self.jobDB = jobDB if jobDB else JobDB()
    self.logDB = logDB if logDB else JobLoggingDB()

    suffix = "Agent"
    agentName = self.am_getModuleParam('agentName')
    if agentName.endswith(suffix):
        agentName = agentName[:-len(suffix)]
    self.am_setModuleParam('optimizerName', agentName)

    # Read back through the module parameters rather than reusing the local
    self.startingMinorStatus = self.am_getModuleParam('optimizerName')
    self.startingMajorStatus = "Checking"
    self.failedStatus = self.am_getOption("FailedJobStatus", 'Failed')
    self.requiredJobInfo = 'jdl'
    self.am_setOption("PollingTime", 30)

    return self.initializeOptimizer()
示例4: initialize
def initialize(self):
    """ Set the default parameters of the agent.

        Creates the JobDB and JobLoggingDB handles, sets the polling time and
        logs a notice when the 'Enable' option is switched off.

        :return: S_OK()
    """
    self.jobDB = JobDB()
    self.logDB = JobLoggingDB()
    self.am_setOption('PollingTime', 60 * 60)

    enabled = self.am_getOption('Enable', True)
    if not enabled:
        self.log.info('Stalled Job Agent running in disabled mode')
    return S_OK()
示例5: initialize
def initialize(self):
    """ Set the agent defaults and load its configuration options.

        Creates the JobDB, TaskQueueDB and JobLoggingDB handles and reads the
        ProductionTypes, MaxJobsAtOnce, JobByJob and ThrottlingPeriod options.

        :return: S_OK()
    """
    self.am_setOption("PollingTime", 60)
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    self.prod_types = self.am_getOption(
        'ProductionTypes',
        ['DataReconstruction', 'DataStripping', 'MCSimulation', 'Merge', 'production'])
    # str.join() replaces string.join(), which only exists in Python 2
    gLogger.info('Will exclude the following Production types from cleaning %s' % (', '.join(self.prod_types)))
    self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce', 200)
    self.jobByJob = self.am_getOption('JobByJob', True)
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)
    return S_OK()
示例6: initialize
def initialize(self):
    """ Set the agent defaults and load its configuration options.

        Creates the JobDB, TaskQueueDB and JobLoggingDB handles, resolves the
        production types excluded from cleaning (agent option first, then the
        Operations value) and reads MaxJobsAtOnce, JobByJob and
        ThrottlingPeriod.

        :return: S_OK()
    """
    self.am_setOption("PollingTime", 60)
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )

    agentTSTypes = self.am_getOption('ProductionTypes', [])
    if agentTSTypes:
        self.prod_types = agentTSTypes
    else:
        self.prod_types = Operations().getValue('Transformations/DataProcessing', ['MCSimulation', 'Merge'])
    # str.join() replaces string.join(), which only exists in Python 2
    gLogger.info('Will exclude the following Production types from cleaning %s' % (', '.join(self.prod_types)))

    self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce', 100)
    self.jobByJob = self.am_getOption('JobByJob', True)
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)
    return S_OK()
示例7: initialize
def initialize(self, jobDB=None, logDB=None):
    """ Initialize the Optimizer Agent.

        Stores the DB handles (building default ones when None is given),
        calls dExit(1) if the job DB reports itself invalid, and derives the
        optimizer name from the agent name by dropping a trailing 'Agent'.

        :param jobDB: job DB object; a JobDB() is created when None
        :param logDB: logging DB object; a JobLoggingDB() is created when None
        :return: whatever self.initializeOptimizer() returns
    """
    if jobDB is None:
        jobDB = JobDB()
    self.jobDB = jobDB
    if not self.jobDB.isValid():
        dExit(1)

    if logDB is None:
        logDB = JobLoggingDB()
    self.logDB = logDB

    suffix = 'Agent'
    name = self.am_getModuleParam('agentName')
    if name.endswith(suffix):
        name = name[:-len(suffix)]
    self.am_setModuleParam('optimizerName', name)

    # Read back through the module parameters rather than reusing the local
    self.startingMinorStatus = self.am_getModuleParam('optimizerName')
    self.failedStatus = self.am_getOption("FailedJobStatus", 'Failed')
    self.am_setOption("PollingTime", 30)
    return self.initializeOptimizer()
示例8: Matcher
class Matcher( object ):
""" Logic for matching
"""
def __init__(self, pilotAgentsDB=None, jobDB=None, tqDB=None, jlDB=None, opsHelper=None):
    """ Wire up the collaborators used for matching.

        Every argument is optional: a collaborator that is missing (or falsy)
        is replaced by a freshly constructed default object.

        :param pilotAgentsDB: replacement for the default PilotAgentsDB()
        :param jobDB: replacement for the default JobDB()
        :param tqDB: replacement for the default TaskQueueDB()
        :param jlDB: replacement for the default JobLoggingDB()
        :param opsHelper: replacement for the default Operations()
    """
    # `or` keeps the truthiness test: any falsy argument selects the default
    self.pilotAgentsDB = pilotAgentsDB or PilotAgentsDB()
    self.jobDB = jobDB or JobDB()
    self.tqDB = tqDB or TaskQueueDB()
    self.jlDB = jlDB or JobLoggingDB()
    self.opsHelper = opsHelper or Operations()

    self.log = gLogger.getSubLogger("Matcher")
    # The limiter shares the job DB and operations helper selected above
    self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper)
def selectJob( self, resourceDescription, credDict ):
""" Main job selection function to find the highest priority job matching the resource capacity
"""
startTime = time.time()
resourceDict = self._getResourceDict( resourceDescription, credDict )
negativeCond = self.limiter.getNegativeCondForSite( resourceDict['Site'] )
result = self.tqDB.matchAndGetJob( resourceDict, negativeCond = negativeCond )
if not result['OK']:
return result
result = result['Value']
if not result['matchFound']:
self.log.info( "No match found" )
raise RuntimeError( "No match found" )
jobID = result['jobId']
resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup', 'Status'] )
if not resAtt['OK']:
raise RuntimeError( 'Could not retrieve job attributes' )
if not resAtt['Value']:
raise RuntimeError( "No attributes returned for job" )
if not resAtt['Value']['Status'] == 'Waiting':
self.log.error( 'Job matched by the TQ is not in Waiting state', str( jobID ) )
result = self.tqDB.deleteJob( jobID )
if not result[ 'OK' ]:
return result
raise RuntimeError( "Job %s is not in Waiting state" % str( jobID ) )
self._reportStatus( resourceDict, jobID )
result = self.jobDB.getJobJDL( jobID )
if not result['OK']:
raise RuntimeError( "Failed to get the job JDL" )
resultDict = {}
resultDict['JDL'] = result['Value']
resultDict['JobID'] = jobID
matchTime = time.time() - startTime
self.log.info( "Match time: [%s]" % str( matchTime ) )
gMonitor.addMark( "matchTime", matchTime )
# Get some extra stuff into the response returned
resOpt = self.jobDB.getJobOptParameters( jobID )
if resOpt['OK']:
for key, value in resOpt['Value'].items():
resultDict[key] = value
resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
if not resAtt['OK']:
raise RuntimeError( 'Could not retrieve job attributes' )
if not resAtt['Value']:
raise RuntimeError( 'No attributes returned for job' )
if self.opsHelper.getValue( "JobScheduling/CheckMatchingDelay", True ):
self.limiter.updateDelayCounters( resourceDict['Site'], jobID )
pilotInfoReportedFlag = resourceDict.get( 'PilotInfoReportedFlag', False )
if not pilotInfoReportedFlag:
self._updatePilotInfo( resourceDict )
self._updatePilotJobMapping( resourceDict, jobID )
resultDict['DN'] = resAtt['Value']['OwnerDN']
resultDict['Group'] = resAtt['Value']['OwnerGroup']
#.........这里部分代码省略.........
示例9: JobCleaningAgent
class JobCleaningAgent( AgentModule ):
"""
The specific agents must provide the following methods:
- initialize() for initial settings
- beginExecution()
- execute() - the main method called in the agent cycle
- endExecution()
- finalize() - the graceful exit of the method, this one is usually used
for the agent restart
"""
#############################################################################
def initialize(self):
    """ Set the agent defaults and load its configuration options.

        Creates the JobDB, TaskQueueDB and JobLoggingDB handles, resolves the
        production types excluded from cleaning (agent option first, then the
        Operations value) and reads MaxJobsAtOnce, JobByJob and
        ThrottlingPeriod.

        :return: S_OK()
    """
    self.am_setOption("PollingTime", 60)
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )

    agentTSTypes = self.am_getOption('ProductionTypes', [])
    if agentTSTypes:
        self.prod_types = agentTSTypes
    else:
        self.prod_types = Operations().getValue('Transformations/DataProcessing', ['MCSimulation', 'Merge'])
    # str.join() replaces string.join(), which only exists in Python 2
    gLogger.info('Will exclude the following Production types from cleaning %s' % (', '.join(self.prod_types)))

    self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce', 100)
    self.jobByJob = self.am_getOption('JobByJob', True)
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)
    return S_OK()
def __getAllowedJobTypes(self):
    """ Work out which job types may be cleaned.

        Asks the JobDB for the distinct 'JobType' values and keeps the ones
        that are not listed in self.prod_types.

        :return: the failed JobDB result unchanged, otherwise
                 S_OK( list of job types to clean )
    """
    distinct = self.jobDB.getDistinctJobAttributes('JobType')
    if not distinct['OK']:
        return distinct

    cleanJobTypes = [jobType for jobType in distinct['Value'] if jobType not in self.prod_types]
    self.log.notice("JobTypes to clean %s" % cleanJobTypes)
    return S_OK(cleanJobTypes)
#############################################################################
def execute(self):
    """ Run one cleaning cycle.

        First removes the jobs in 'Deleted' status, then, for every status in
        REMOVE_STATUS_DELAY, removes the jobs of the cleanable job types that
        are older than the delay configured for that status.

        :return: the failed result if the 'Deleted' removal or the job type
                 lookup fails, otherwise S_OK(); a failing per-status removal
                 is only logged as a warning
    """
    # Delete jobs in "Deleted" state
    outcome = self.removeJobsByStatus({'Status': 'Deleted'})
    if not outcome['OK']:
        return outcome

    # Get all the Job types that can be cleaned
    outcome = self.__getAllowedJobTypes()
    if not outcome['OK']:
        return outcome
    cleanableTypes = outcome['Value']

    # Remove jobs with final status
    for status in REMOVE_STATUS_DELAY:
        delay = REMOVE_STATUS_DELAY[status]
        selection = {'JobType': cleanableTypes, 'Status': status}
        olderThan = str(Time.dateTime() - delay * Time.day)
        outcome = self.removeJobsByStatus(selection, olderThan)
        if not outcome['OK']:
            gLogger.warn('Failed to remove jobs in status %s' % status)
    return S_OK()
def removeJobsByStatus( self, condDict, delay = False ):
""" Remove deleted jobs
"""
if delay:
gLogger.verbose( "Removing jobs with %s and older than %s" % ( condDict, delay ) )
result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
else:
gLogger.verbose( "Removing jobs with %s " % condDict )
result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )
if not result['OK']:
return result
jobList = result['Value']
if len(jobList) > self.maxJobsAtOnce:
jobList = jobList[:self.maxJobsAtOnce]
if not jobList:
return S_OK()
self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )
count = 0
error_count = 0
result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
if not result[ 'OK' ]:
gLogger.warn( "Cannot unassign jobs to sandboxes", result[ 'Message' ] )
result = self.deleteJobOversizedSandbox( jobList )
if not result[ 'OK' ]:
gLogger.warn( "Cannot schedle removal of oversized sandboxes", result[ 'Message' ] )
return result
#.........这里部分代码省略.........
示例10: OptimizerModule
class OptimizerModule(AgentModule):
"""
The specific agents must provide the following methods:
* initialize() for initial settings
* beginExecution()
* execute() - the main method called in the agent cycle
* endExecution()
* finalize() - the graceful exit of the method, this one is usually used
for the agent restart
"""
#############################################################################
def __init__(self, *args, **kwargs):
    """ Build the module and reset the optimizer state.

        All arguments are handed over unchanged to AgentModule.__init__.
    """
    AgentModule.__init__(self, *args, **kwargs)

    # DB handles, filled in by initialize()
    self.jobDB = self.logDB = None

    # Status bookkeeping
    self.startingMinorStatus = None
    self.startingMajorStatus = "Checking"
    self.failedStatus = None

    self.requiredJobInfo = 'jdl'
    # Value of the latest initializeOptimizer() result, stored by execute()
    self._initResult = None
def initialize(self, jobDB=None, logDB=None):
    """ Initialize the Optimizer Agent.

        Stores the DB handles (building default ones when None is given),
        calls dExit(1) if the job DB reports itself invalid, and derives the
        optimizer name from the agent name by dropping a trailing 'Agent'.

        :param jobDB: job DB object; a JobDB() is created when None
        :param logDB: logging DB object; a JobLoggingDB() is created when None
        :return: whatever self.initializeOptimizer() returns
    """
    if jobDB is None:
        jobDB = JobDB()
    self.jobDB = jobDB
    if not self.jobDB.isValid():
        dExit(1)

    if logDB is None:
        logDB = JobLoggingDB()
    self.logDB = logDB

    suffix = 'Agent'
    name = self.am_getModuleParam('agentName')
    if name.endswith(suffix):
        name = name[:-len(suffix)]
    self.am_setModuleParam('optimizerName', name)

    # Read back through the module parameters rather than reusing the local
    self.startingMinorStatus = self.am_getModuleParam('optimizerName')
    self.failedStatus = self.am_getOption("FailedJobStatus", 'Failed')
    self.am_setOption("PollingTime", 30)
    return self.initializeOptimizer()
def initializeOptimizer(self):
    """ Hook for optimizer-specific initialization.

        Meant to be overridden by the inheriting class; this default does
        nothing.

        :return: S_OK()
    """
    return S_OK()
#############################################################################
def execute(self):
    """ Run one optimizer cycle.

        Re-runs initializeOptimizer(), selects the jobs whose Status (and
        MinorStatus, when a starting minor status is set) match this
        optimizer, and hands each one to optimizeJob(). A job whose
        definition cannot be retrieved is marked as failed and skipped.

        :return: the failed initializeOptimizer() result, S_ERROR when the job
                 selection fails, S_OK( 'No work to do' ) when nothing is
                 pending, otherwise S_OK()
    """
    initResult = self.initializeOptimizer()
    if not initResult['OK']:
        return initResult
    self._initResult = initResult['Value']

    condition = {'Status': self.startingMajorStatus}
    if self.startingMinorStatus:
        condition['MinorStatus'] = self.startingMinorStatus

    selection = self.jobDB.selectJobs(condition)
    if not selection['OK']:
        self.log.warn('Failed to get a job list from the JobDB')
        return S_ERROR('Failed to get a job list from the JobDB')

    pendingJobs = selection['Value']
    if not pendingJobs:
        self.log.verbose('No pending jobs to process')
        return S_OK('No work to do')

    for job in pendingJobs:
        definition = self.getJobDefinition(job)
        if definition['OK']:
            # The outcome is not inspected here: optimizeJob() marks failures itself
            self.optimizeJob(job, definition['Value']['classad'])
        else:
            self.setFailedJob(job, definition['Message'], '')

    return S_OK()
#############################################################################
def optimizeJob(self, job, classAdJob):
    """ Run this optimizer's checkJob() on one job.

        When checkJob() reports a failure, the job is marked as failed with
        the returned message.

        :param job: job identifier, passed through to checkJob()
        :param classAdJob: job description, passed through to checkJob()
        :return: the checkJob() result, whether it succeeded or not
    """
    optimizerName = self.am_getModuleParam('optimizerName')
    self.log.info('Job %s will be processed by %sAgent' % (job, optimizerName))

    outcome = self.checkJob(job, classAdJob)
    if outcome['OK']:
        return outcome

    self.setFailedJob(job, outcome['Message'], classAdJob)
    return outcome
#############################################################################
def getJobDefinition(self, job, jobDef=False):
""" Retrieve JDL of the Job and return jobDef dictionary
"""
if not jobDef:
jobDef = {}
# If not jdl in jobinfo load it
#.........这里部分代码省略.........
示例11: StalledJobAgent
class StalledJobAgent( AgentModule ):
"""
The specific agents must provide the following methods:
- initialize() for initial settings
- beginExecution()
- execute() - the main method called in the agent cycle
- endExecution()
- finalize() - the graceful exit of the method, this one is usually used
for the agent restart
"""
jobDB = None
logDB = None
matchedTime = 7200
rescheduledTime = 600
completedTime = 86400
#############################################################################
def initialize(self):
    """ Set the default parameters of the agent.

        Creates the JobDB and JobLoggingDB handles, sets the polling time and
        logs a notice when the 'Enable' option is switched off.

        :return: S_OK()
    """
    self.jobDB = JobDB()
    self.logDB = JobLoggingDB()
    self.am_setOption('PollingTime', 60 * 60)

    enabled = self.am_getOption('Enable', True)
    if not enabled:
        self.log.info('Stalled Job Agent running in disabled mode')
    return S_OK()
#############################################################################
def execute(self):
    """ Run one cycle of the Stalled Job Agent.

        Reads the StalledTimeHours and FailedTimeHours options, adds half a
        cycle to each and multiplies by the JobWrapper watchdog cycle. It then
        runs four steps in order: mark stalled jobs, fail stalled jobs, fail
        completed jobs, kick stuck jobs. A failing step is logged as an error
        and the cycle carries on.

        :return: S_ERROR when the WorkloadManagement instance cannot be
                 resolved, otherwise S_OK( 'Stalled Job Agent cycle complete' )
    """
    self.log.verbose('Waking up Stalled Job Agent')

    wms_instance = getSystemInstance('WorkloadManagement')
    if not wms_instance:
        return S_ERROR('Can not get the WorkloadManagement system instance')
    wrapperSection = cfgPath('Systems', 'WorkloadManagement', wms_instance, 'JobWrapper')

    stalledCycles = self.am_getOption('StalledTimeHours', 2)
    failedCycles = self.am_getOption('FailedTimeHours', 6)

    self.matchedTime = self.am_getOption('MatchedTime', self.matchedTime)
    self.rescheduledTime = self.am_getOption('RescheduledTime', self.rescheduledTime)
    self.completedTime = self.am_getOption('CompletedTime', self.completedTime)

    self.log.verbose('StalledTime = %s cycles' % stalledCycles)
    self.log.verbose('FailedTime = %s cycles' % failedCycles)

    checkingTime = gConfig.getValue(cfgPath(wrapperSection, 'CheckingTime'), 30 * 60)
    minCheckingTime = gConfig.getValue(cfgPath(wrapperSection, 'MinCheckingTime'), 20 * 60)
    watchdogCycle = max(checkingTime, minCheckingTime)

    # Add half cycle to avoid race conditions
    stalledTime = watchdogCycle * (stalledCycles + 0.5)
    failedTime = watchdogCycle * (failedCycles + 0.5)

    # Note, jobs will be revived automatically during the heartbeat signal phase and
    # subsequent status changes will result in jobs not being selected by the
    # stalled job agent.
    # Lambdas keep each method lookup lazy, so the steps run strictly in sequence.
    steps = (
        (lambda: self.__markStalledJobs(stalledTime), 'Failed to detect stalled jobs'),
        (lambda: self.__failStalledJobs(failedTime), 'Failed to process stalled jobs'),
        (lambda: self.__failCompletedJobs(), 'Failed to process completed jobs'),
        (lambda: self.__kickStuckJobs(), 'Failed to kick stuck jobs'),
    )
    for runStep, failureMessage in steps:
        result = runStep()
        if not result['OK']:
            self.log.error(failureMessage, result['Message'])

    return S_OK('Stalled Job Agent cycle complete')
#############################################################################
def __markStalledJobs( self, stalledTime ):
""" Identifies stalled jobs running without update longer than stalledTime.
"""
stalledCounter = 0
runningCounter = 0
result = self.jobDB.selectJobs( {'Status':'Running'} )
if not result['OK']:
return result
if not result['Value']:
return S_OK()
jobs = result['Value']
self.log.info( '%s Running jobs will be checked for being stalled' % ( len( jobs ) ) )
jobs.sort()
# jobs = jobs[:10] #for debugging
for job in jobs:
result = self.__getStalledJob( job, stalledTime )
if result['OK']:
self.log.verbose( 'Updating status to Stalled for job %s' % ( job ) )
self.__updateJobStatus( job, 'Stalled' )
stalledCounter += 1
else:
self.log.verbose( result['Message'] )
#.........这里部分代码省略.........
示例12: JobCleaningAgent
class JobCleaningAgent( AgentModule ):
"""
The specific agents must provide the following methods:
- initialize() for initial settings
- beginExecution()
- execute() - the main method called in the agent cycle
- endExecution()
- finalize() - the graceful exit of the method, this one is usually used
for the agent restart
"""
#############################################################################
def initialize(self):
    """ Set the agent defaults and load its configuration options.

        Creates the JobDB, TaskQueueDB and JobLoggingDB handles and reads the
        ProductionTypes, MaxJobsAtOnce, JobByJob and ThrottlingPeriod options.

        :return: S_OK()
    """
    self.am_setOption("PollingTime", 60)
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    self.prod_types = self.am_getOption(
        'ProductionTypes',
        ['DataReconstruction', 'DataStripping', 'MCSimulation', 'Merge', 'production'])
    # str.join() replaces string.join(), which only exists in Python 2
    gLogger.info('Will exclude the following Production types from cleaning %s' % (', '.join(self.prod_types)))
    self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce', 200)
    self.jobByJob = self.am_getOption('JobByJob', True)
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)
    return S_OK()
def __getAllowedJobTypes(self):
    """ Work out which job types may be cleaned.

        Asks the JobDB for the distinct 'JobType' values and keeps the ones
        that are not listed in self.prod_types.

        :return: the failed JobDB result unchanged, otherwise
                 S_OK( list of job types to clean )
    """
    distinct = self.jobDB.getDistinctJobAttributes('JobType')
    if not distinct['OK']:
        return distinct

    cleanJobTypes = [jobType for jobType in distinct['Value'] if jobType not in self.prod_types]
    self.log.notice("JobTypes to clean %s" % cleanJobTypes)
    return S_OK(cleanJobTypes)
#############################################################################
def execute(self):
    """ Run one cleaning cycle.

        First removes the jobs in 'Deleted' status, then, for every status in
        REMOVE_STATUS_DELAY, removes the jobs of the cleanable job types that
        are older than the delay configured for that status.

        :return: the failed result if the 'Deleted' removal or the job type
                 lookup fails, otherwise S_OK(); a failing per-status removal
                 is only logged as a warning
    """
    # Delete jobs in "Deleted" state
    outcome = self.removeJobsByStatus({'Status': 'Deleted'})
    if not outcome['OK']:
        return outcome

    # Get all the Job types that can be cleaned
    outcome = self.__getAllowedJobTypes()
    if not outcome['OK']:
        return outcome
    cleanableTypes = outcome['Value']

    # Remove jobs with final status
    for status in REMOVE_STATUS_DELAY:
        delay = REMOVE_STATUS_DELAY[status]
        selection = {'JobType': cleanableTypes, 'Status': status}
        olderThan = str(Time.dateTime() - delay * Time.day)
        outcome = self.removeJobsByStatus(selection, olderThan)
        if not outcome['OK']:
            gLogger.warn('Failed to remove jobs in status %s' % status)
    return S_OK()
def removeJobsByStatus( self, condDict, delay = False ):
""" Remove deleted jobs
"""
if delay:
gLogger.verbose( "Removing jobs with %s and older than %s" % ( condDict, delay ) )
result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
else:
gLogger.verbose( "Removing jobs with %s " % condDict )
result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )
if not result['OK']:
return result
jobList = result['Value']
if len(jobList) > self.maxJobsAtOnce:
jobList = jobList[:self.maxJobsAtOnce]
if not jobList:
return S_OK()
self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )
count = 0
error_count = 0
result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
if not result[ 'OK' ]:
gLogger.warn( "Cannot unassign jobs to sandboxes", result[ 'Message' ] )
if self.jobByJob:
for jobID in jobList:
resultJobDB = self.jobDB.removeJobFromDB( jobID )
resultTQ = self.taskQueueDB.deleteJob( jobID )
resultLogDB = self.jobLoggingDB.deleteJob( jobID )
errorFlag = False
if not resultJobDB['OK']:
gLogger.warn( 'Failed to remove job %d from JobDB' % jobID, result['Message'] )
errorFlag = True
#.........这里部分代码省略.........
示例13: StalledJobAgent
class StalledJobAgent(AgentModule):
"""
The specific agents must provide the following methods:
- initialize() for initial settings
- beginExecution()
- execute() - the main method called in the agent cycle
- endExecution()
- finalize() - the graceful exit of the method, this one is usually used
for the agent restart
"""
jobDB = None
logDB = None
matchedTime = 7200
rescheduledTime = 600
completedTime = 86400
#############################################################################
def initialize(self):
    """ Set the default parameters of the agent.

        Creates the JobDB and JobLoggingDB handles, sets the polling time and
        logs a notice when the "Enable" option is switched off.

        :return: S_OK()
    """
    self.jobDB = JobDB()
    self.logDB = JobLoggingDB()
    self.am_setOption("PollingTime", 60 * 60)

    enabled = self.am_getOption("Enable", True)
    if not enabled:
        self.log.info("Stalled Job Agent running in disabled mode")
    return S_OK()
#############################################################################
def execute(self):
    """ Run one cycle of the Stalled Job Agent.

        Reads the StalledTimeHours and FailedTimeHours options, adds half a
        cycle to each and multiplies by the JobWrapper watchdog cycle. It then
        runs four steps in order: mark stalled jobs, fail stalled jobs, fail
        completed jobs, kick stuck jobs. A failing step is logged as an error
        and the cycle carries on.

        :return: S_ERROR when the WorkloadManagement instance cannot be
                 resolved, otherwise S_OK("Stalled Job Agent cycle complete")
    """
    self.log.verbose("Waking up Stalled Job Agent")

    wms_instance = getSystemInstance("WorkloadManagement")
    if not wms_instance:
        return S_ERROR("Can not get the WorkloadManagement system instance")
    wrapperSection = cfgPath("Systems", "WorkloadManagement", wms_instance, "JobWrapper")

    stalledCycles = self.am_getOption("StalledTimeHours", 2)
    failedCycles = self.am_getOption("FailedTimeHours", 6)

    self.matchedTime = self.am_getOption("MatchedTime", self.matchedTime)
    self.rescheduledTime = self.am_getOption("RescheduledTime", self.rescheduledTime)
    self.completedTime = self.am_getOption("CompletedTime", self.completedTime)

    self.log.verbose("StalledTime = %s cycles" % stalledCycles)
    self.log.verbose("FailedTime = %s cycles" % failedCycles)

    checkingTime = gConfig.getValue(cfgPath(wrapperSection, "CheckingTime"), 30 * 60)
    minCheckingTime = gConfig.getValue(cfgPath(wrapperSection, "MinCheckingTime"), 20 * 60)
    watchdogCycle = max(checkingTime, minCheckingTime)

    # Add half cycle to avoid race conditions
    stalledTime = watchdogCycle * (stalledCycles + 0.5)
    failedTime = watchdogCycle * (failedCycles + 0.5)

    # Note, jobs will be revived automatically during the heartbeat signal phase and
    # subsequent status changes will result in jobs not being selected by the
    # stalled job agent.
    # Lambdas keep each method lookup lazy, so the steps run strictly in sequence.
    steps = (
        (lambda: self.__markStalledJobs(stalledTime), "Failed to detect stalled jobs"),
        (lambda: self.__failStalledJobs(failedTime), "Failed to process stalled jobs"),
        (lambda: self.__failCompletedJobs(), "Failed to process completed jobs"),
        (lambda: self.__kickStuckJobs(), "Failed to kick stuck jobs"),
    )
    for runStep, failureMessage in steps:
        result = runStep()
        if not result["OK"]:
            self.log.error(failureMessage, result["Message"])

    return S_OK("Stalled Job Agent cycle complete")
#############################################################################
def __markStalledJobs(self, stalledTime):
""" Identifies stalled jobs running without update longer than stalledTime.
"""
stalledCounter = 0
runningCounter = 0
result = self.jobDB.selectJobs({"Status": "Running"})
if not result["OK"]:
return result
if not result["Value"]:
return S_OK()
jobs = result["Value"]
self.log.info("%s Running jobs will be checked for being stalled" % (len(jobs)))
jobs.sort()
# jobs = jobs[:10] #for debugging
for job in jobs:
result = self.__getStalledJob(job, stalledTime)
if result["OK"]:
self.log.verbose("Updating status to Stalled for job %s" % (job))
self.__updateJobStatus(job, "Stalled")
stalledCounter += 1
else:
#.........这里部分代码省略.........
示例14: JobCleaningAgent
class JobCleaningAgent( AgentModule ):
"""
The specific agents must provide the following methods:
* initialize() for initial settings
* beginExecution()
* execute() - the main method called in the agent cycle
* endExecution()
* finalize() - the graceful exit of the method, this one is usually used for the agent restart
"""
def __init__(self, *args, **kwargs):
    """ Build the agent and set its attributes to their initial values.

        All arguments are handed over unchanged to AgentModule.__init__.
    """
    AgentModule.__init__(self, *args, **kwargs)

    # clients
    # FIXME: shouldn't we avoid using the DBs directly, and instead go through the service?
    self.jobDB = self.taskQueueDB = self.jobLoggingDB = None

    # Initial values for the cleaning parameters
    self.maxJobsAtOnce = 100
    self.jobByJob = False
    self.throttlingPeriod = 0.

    self.prodTypes = []
    # Filled per status by initialize() from the RemoveStatusDelay/* options
    self.removeStatusDelay = {}
#############################################################################
def initialize(self):
    """ Set the agent defaults and load its configuration options.

        Creates the JobDB, TaskQueueDB and JobLoggingDB handles, resolves the
        production types excluded from cleaning and reads the MaxJobsAtOnce,
        JobByJob, ThrottlingPeriod and RemoveStatusDelay/* options.

        :return: S_OK()
    """
    self.am_setOption("PollingTime", 120)

    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )

    # The agent option wins; the Operations value is only the fallback
    productionTypes = self.am_getOption('ProductionTypes', [])
    if not productionTypes:
        productionTypes = Operations().getValue('Transformations/DataProcessing', ['MCSimulation', 'Merge'])
    self.prodTypes = productionTypes
    excluded = ', '.join(self.prodTypes)
    gLogger.info("Will exclude the following Production types from cleaning %s" % excluded)

    self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce', 500)
    self.jobByJob = self.am_getOption('JobByJob', False)
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)

    # (status, option name, default value) for each removal delay
    delayOptions = (
        ('Done', 'RemoveStatusDelay/Done', 7),
        ('Killed', 'RemoveStatusDelay/Killed', 7),
        ('Failed', 'RemoveStatusDelay/Failed', 7),
        ('Any', 'RemoveStatusDelay/Any', -1),
    )
    for status, optionName, default in delayOptions:
        self.removeStatusDelay[status] = self.am_getOption(optionName, default)

    return S_OK()
def __getAllowedJobTypes(self):
    """ Work out which job types may be cleaned.

        Asks the JobDB for the distinct 'JobType' values and keeps the ones
        that are not listed in self.prodTypes.

        :return: the failed JobDB result unchanged, otherwise
                 S_OK( list of job types to clean )
    """
    distinct = self.jobDB.getDistinctJobAttributes('JobType')
    if not distinct['OK']:
        return distinct

    cleanJobTypes = [jobType for jobType in distinct['Value'] if jobType not in self.prodTypes]
    self.log.notice("JobTypes to clean %s" % cleanJobTypes)
    return S_OK(cleanJobTypes)
#############################################################################
def execute( self ):
""" Remove jobs in various status
"""
#Delete jobs in "Deleted" state
result = self.removeJobsByStatus( { 'Status' : 'Deleted' } )
if not result[ 'OK' ]:
return result
#Get all the Job types that can be cleaned
result = self.__getAllowedJobTypes()
if not result[ 'OK' ]:
return result
# No jobs in the system subject to removal
if not result['Value']:
return S_OK()
baseCond = { 'JobType' : result[ 'Value' ] }
# Remove jobs with final status
for status in self.removeStatusDelay:
delay = self.removeStatusDelay[ status ]
if delay < 0:
# Negative delay means don't delete anything...
continue
condDict = dict( baseCond )
if status != 'Any':
condDict[ 'Status' ] = status
delTime = str( Time.dateTime() - delay * Time.day )
#.........这里部分代码省略.........