本文整理汇总了Python中DIRAC.WorkloadManagementSystem.DB.JobDB.JobDB.selectJobs方法的典型用法代码示例。如果您正苦于以下问题:Python JobDB.selectJobs方法的具体用法?Python JobDB.selectJobs怎么用?Python JobDB.selectJobs使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类DIRAC.WorkloadManagementSystem.DB.JobDB.JobDB的用法示例。
在下文中一共展示了JobDB.selectJobs方法的12个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: JobDBTestCase
# 需要导入模块: from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB [as 别名]
# 或者: from DIRAC.WorkloadManagementSystem.DB.JobDB.JobDB import selectJobs [as 别名]
class JobDBTestCase(unittest.TestCase):
    """Base class for the JobDB test cases.

    ``setUp`` creates a ``JobDB`` handle; ``tearDown`` removes every job
    returned by ``selectJobs({})``.
    """

    def setUp(self):
        """Set the logger to DEBUG and create the JobDB handle."""
        gLogger.setLevel('DEBUG')
        self.jobDB = JobDB()

    def tearDown(self):
        """Select all jobs in the JobDB and remove them one by one."""
        result = self.jobDB.selectJobs({})
        # assertTrue replaces the deprecated ``assert_`` alias
        self.assertTrue(result['OK'], 'Status after selectJobs')
        for job in result['Value']:
            result = self.jobDB.removeJobFromDB(job)
            self.assertTrue(result['OK'], 'Failed to remove job %s' % job)
示例2: JobCleaningAgent
# 需要导入模块: from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB [as 别名]
# 或者: from DIRAC.WorkloadManagementSystem.DB.JobDB.JobDB import selectJobs [as 别名]
class JobCleaningAgent( AgentModule ):
"""
The specific agents must provide the following methods:
- initialize() for initial settings
- beginExecution()
- execute() - the main method called in the agent cycle
- endExecution()
- finalize() - the graceful exit of the method, this one is usually used
for the agent restart
"""
#############################################################################
def initialize(self):
    """Set the agent defaults.

    Creates the JobDB, TaskQueueDB and JobLoggingDB handles, determines the
    production job types that are excluded from cleaning and reads the
    ``MaxJobsAtOnce``, ``JobByJob`` and ``ThrottlingPeriod`` options.

    :return: S_OK()
    """
    self.am_setOption("PollingTime", 60)
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()

    # The agent option wins; the Operations value is only the fallback
    agentTSTypes = self.am_getOption('ProductionTypes', [])
    if agentTSTypes:
        self.prod_types = agentTSTypes
    else:
        self.prod_types = Operations().getValue('Transformations/DataProcessing', ['MCSimulation', 'Merge'])
    # str.join works on Python 2 and 3, unlike the Python 2-only string.join()
    gLogger.info('Will exclude the following Production types from cleaning %s' % ', '.join(self.prod_types))

    self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce', 100)
    self.jobByJob = self.am_getOption('JobByJob', True)
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)
    return S_OK()
def __getAllowedJobTypes(self):
    """Work out which job types may be cleaned.

    :return: the failed result of getDistinctJobAttributes, otherwise S_OK
             with the list of job types not listed in self.prod_types
    """
    distinct = self.jobDB.getDistinctJobAttributes('JobType')
    if not distinct['OK']:
        return distinct
    # Anything that is not a production type is eligible for cleaning
    eligible = [candidate for candidate in distinct['Value'] if candidate not in self.prod_types]
    self.log.notice("JobTypes to clean %s" % eligible)
    return S_OK(eligible)
#############################################################################
def execute(self):
    """Run one cleaning cycle.

    Removes the jobs in "Deleted" state first, then, for every status in
    REMOVE_STATUS_DELAY, the jobs of a cleanable type that are older than
    the delay configured for that status.

    :return: S_OK(), or the failed result of the first two steps
    """
    # Jobs already flagged as "Deleted" go first
    outcome = self.removeJobsByStatus({'Status': 'Deleted'})
    if not outcome['OK']:
        return outcome

    # Job types that are allowed to be cleaned
    outcome = self.__getAllowedJobTypes()
    if not outcome['OK']:
        return outcome
    cleanableTypes = outcome['Value']

    # Jobs in a final status, once they are old enough
    for status, delay in REMOVE_STATUS_DELAY.items():
        selection = {'JobType': cleanableTypes, 'Status': status}
        threshold = str(Time.dateTime() - delay * Time.day)
        outcome = self.removeJobsByStatus(selection, threshold)
        if not outcome['OK']:
            # A failure for one status does not stop the others
            gLogger.warn('Failed to remove jobs in status %s' % status)
    return S_OK()
def removeJobsByStatus( self, condDict, delay = False ):
""" Remove deleted jobs
"""
if delay:
gLogger.verbose( "Removing jobs with %s and older than %s" % ( condDict, delay ) )
result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
else:
gLogger.verbose( "Removing jobs with %s " % condDict )
result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )
if not result['OK']:
return result
jobList = result['Value']
if len(jobList) > self.maxJobsAtOnce:
jobList = jobList[:self.maxJobsAtOnce]
if not jobList:
return S_OK()
self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )
count = 0
error_count = 0
result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
if not result[ 'OK' ]:
gLogger.warn( "Cannot unassign jobs to sandboxes", result[ 'Message' ] )
result = self.deleteJobOversizedSandbox( jobList )
if not result[ 'OK' ]:
gLogger.warn( "Cannot schedle removal of oversized sandboxes", result[ 'Message' ] )
return result
#.........这里部分代码省略.........
示例3: OptimizerModule
# 需要导入模块: from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB [as 别名]
# 或者: from DIRAC.WorkloadManagementSystem.DB.JobDB.JobDB import selectJobs [as 别名]
class OptimizerModule(AgentModule):
"""
The specific agents must provide the following methods:
* initialize() for initial settings
* beginExecution()
* execute() - the main method called in the agent cycle
* endExecution()
* finalize() - the graceful exit of the method, this one is usually used
for the agent restart
"""
#############################################################################
def __init__(self, *args, **kwargs):
    """Constructor.

    Forwards all arguments to AgentModule and declares the optimizer
    attributes with their initial values.
    """
    AgentModule.__init__(self, *args, **kwargs)
    # Database handles, assigned in initialize()
    self.jobDB = None
    self.logDB = None
    # Status selection used by execute()
    self.startingMajorStatus = "Checking"
    self.startingMinorStatus = None
    self.failedStatus = None
    self.requiredJobInfo = 'jdl'
    # Value returned by initializeOptimizer() in the current cycle
    self._initResult = None
def initialize(self, jobDB=None, logDB=None):
    """Initialise the optimizer agent.

    :param jobDB: JobDB instance to use; a new one is created when None
    :param logDB: JobLoggingDB instance to use; a new one is created when None
    :return: the result of self.initializeOptimizer()
    """
    if jobDB is None:
        jobDB = JobDB()
    self.jobDB = jobDB
    if not self.jobDB.isValid():
        dExit(1)

    if logDB is None:
        logDB = JobLoggingDB()
    self.logDB = logDB

    # The optimizer name is the agent name without a trailing "Agent"
    suffix = 'Agent'
    name = self.am_getModuleParam('agentName')
    if name.endswith(suffix):
        name = name[:-len(suffix)]
    self.am_setModuleParam('optimizerName', name)

    self.startingMinorStatus = self.am_getModuleParam('optimizerName')
    self.failedStatus = self.am_getOption("FailedJobStatus", 'Failed')
    self.am_setOption("PollingTime", 30)
    return self.initializeOptimizer()
def initializeOptimizer(self):
    """Hook for subclasses; this base implementation just returns S_OK().

    Called by initialize() and again at the start of every execute() cycle.
    """
    return S_OK()
#############################################################################
def execute(self):
    """Run one optimizer cycle.

    Re-runs initializeOptimizer(), selects the jobs whose status (and minor
    status, when set) match this optimizer and calls optimizeJob() on each.
    A job whose definition cannot be retrieved is marked as failed.

    :return: S_OK / S_ERROR
    """
    initResult = self.initializeOptimizer()
    if not initResult['OK']:
        return initResult
    self._initResult = initResult['Value']

    selection = {'Status': self.startingMajorStatus}
    if self.startingMinorStatus:
        selection['MinorStatus'] = self.startingMinorStatus

    selected = self.jobDB.selectJobs(selection)
    if not selected['OK']:
        self.log.warn('Failed to get a job list from the JobDB')
        return S_ERROR('Failed to get a job list from the JobDB')

    jobIDs = selected['Value']
    if not jobIDs:
        self.log.verbose('No pending jobs to process')
        return S_OK('No work to do')

    for jobID in jobIDs:
        definition = self.getJobDefinition(jobID)
        if definition['OK']:
            # The outcome of a single job does not affect the cycle result
            self.optimizeJob(jobID, definition['Value']['classad'])
        else:
            self.setFailedJob(jobID, definition['Message'], '')
    return S_OK()
#############################################################################
def optimizeJob(self, job, classAdJob):
    """Run this optimizer's checkJob() on one job.

    :param job: job identifier, passed through to checkJob()
    :param classAdJob: job description, passed through to checkJob()
    :return: the result of checkJob(); on failure the job is also marked
             as failed with the returned message
    """
    optimizerName = self.am_getModuleParam('optimizerName')
    self.log.info('Job %s will be processed by %sAgent' % (job, optimizerName))
    outcome = self.checkJob(job, classAdJob)
    if outcome['OK']:
        return outcome
    self.setFailedJob(job, outcome['Message'], classAdJob)
    return outcome
#############################################################################
def getJobDefinition(self, job, jobDef=False):
""" Retrieve JDL of the Job and return jobDef dictionary
"""
if not jobDef:
jobDef = {}
# If not jdl in jobinfo load it
#.........这里部分代码省略.........
示例4: ThreadedMightyOptimizer
# 需要导入模块: from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB [as 别名]
# 或者: from DIRAC.WorkloadManagementSystem.DB.JobDB.JobDB import selectJobs [as 别名]
class ThreadedMightyOptimizer(AgentModule):
"""
The specific agents must provide the following methods:
- initialize() for initial settings
- beginExecution()
- execute() - the main method called in the agent cycle
- endExecution()
- finalize() - the graceful exit of the method, this one is usually used
for the agent restart
"""
__jobStates = ["Received", "Checking"]
__defaultValidOptimizers = [
"WorkloadManagement/JobPath",
"WorkloadManagement/JobSanity",
"WorkloadManagement/JobScheduling",
"WorkloadManagement/TaskQueue",
]
def initialize(self):
    """Prepare the agent state.

    Creates the JobDB and JobLoggingDB handles, the tracker of jobs being
    optimized and the empty optimizer registries, then sets the polling
    time to 30.

    :return: S_OK()
    """
    # Database handles
    self.jobDB = JobDB()
    self.jobLoggingDB = JobLoggingDB()
    # Bookkeeping for jobs in progress and for loaded optimizers
    self._optimizingJobs = JobsInTheWorks()
    self._optimizers = {}
    self._threadedOptimizers = {}
    self.am_setOption("PollingTime", 30)
    return S_OK()
def execute(self):
    """Run one agent cycle.

    Selects the jobs in one of the starting states, registers them with the
    tracker of jobs being optimized and dispatches every job that was not
    already being worked on.

    :return: S_OK(), or the failed result of the JobDB queries
    """
    # Get jobs from DB
    result = self.jobDB.selectJobs({"Status": self.__jobStates})
    if not result["OK"]:
        gLogger.error("Cannot retrieve jobs in states %s" % self.__jobStates)
        return result
    # Work on integer job IDs in ascending order
    jobsList = sorted(int(jobId) for jobId in result["Value"])
    self.log.info("Got %s jobs for this iteration" % len(jobsList))
    if not jobsList:
        return S_OK()

    # Check jobs that are already being optimized
    newJobsList = self._optimizingJobs.addJobs(jobsList)
    if not newJobsList:
        return S_OK()

    # Get attrs of jobs to be optimized
    result = self.jobDB.getAttributesForJobList(newJobsList)
    if not result["OK"]:
        # The original format string had two placeholders for one argument
        # and raised TypeError here; the error text goes in the second field.
        gLogger.error("Cannot retrieve attributes for %s jobs" % len(newJobsList), result["Message"])
        return result

    jobsToProcess = result["Value"]
    for jobId in jobsToProcess:
        self.log.info("== Processing job %s == " % jobId)
        jobAttrs = jobsToProcess[jobId]
        # No job definition is available at this point, hence False
        result = self.__dispatchJob(jobId, jobAttrs, False)
        if not result["OK"]:
            gLogger.error("There was a problem optimizing job", "JID %s: %s" % (jobId, result["Message"]))
    return S_OK()
def __dispatchJob(self, jobId, jobAttrs, jobDef, keepOptimizing=True):
    """Decide what to do with a job.

    When keepOptimizing is true the job is sent to its next optimizer. If
    that succeeds and an optimizer took the job, nothing else is done.
    In every other case the job is removed from the tracker of jobs being
    optimized.

    :param jobId: job identifier
    :param jobAttrs: job attributes, passed to __sendJobToOptimizer
    :param jobDef: job definition, passed to __sendJobToOptimizer
    :param keepOptimizing: when False the job is only dropped from the tracker
    :return: S_OK(), or the failed result of __sendJobToOptimizer
    """
    returnValue = S_OK()
    if keepOptimizing:
        result = self.__sendJobToOptimizer(jobId, jobAttrs, jobDef)
        if result["OK"] and result["Value"]:
            # An optimizer accepted the job; it stays in the tracker
            return S_OK()
        if not result["OK"]:
            returnValue = result
            # "\tMessage" fixes the invalid "\M" escape of the original string
            gLogger.error(
                "Could not send job to optimizer\n", "\tJob: %s\n\tMessage: %s" % (jobId, result["Message"])
            )
    self._optimizingJobs.deleteJob(jobId)
    return returnValue
def __sendJobToOptimizer(self, jobId, jobAttrs, jobDef):
    """Queue a job on the optimizer that has to process it next.

    :return: S_OK(True) when the job was handed to an optimizer, S_OK(False)
             when there is no next optimizer, it is not a valid one or it
             could not be initialized
    """
    name = self.__getNextOptimizerName(jobAttrs)
    if not name or name not in self.am_getOption("ValidOptimizers", self.__defaultValidOptimizers):
        return S_OK(False)

    if name in self._threadedOptimizers:
        worker = self._threadedOptimizers[name]
    else:
        # First job for this optimizer: create it and register it if it initializes
        worker = ThreadedOptimizer(name, self.am_getModuleParam("fullName"), self.__dispatchJob)
        if not worker.initialize(self.jobDB, self.jobLoggingDB)["OK"]:
            return S_OK(False)
        self._threadedOptimizers[name] = worker

    worker.optimizeJob(jobId, jobAttrs, jobDef)
    return S_OK(True)
def __getNextOptimizerName(self, jobAttrs):
""" Determine next Optimizer
"""
if jobAttrs["Status"] == "Received":
#.........这里部分代码省略.........
示例5: StalledJobAgent
# 需要导入模块: from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB [as 别名]
# 或者: from DIRAC.WorkloadManagementSystem.DB.JobDB.JobDB import selectJobs [as 别名]
class StalledJobAgent( AgentModule ):
"""
The specific agents must provide the following methods:
- initialize() for initial settings
- beginExecution()
- execute() - the main method called in the agent cycle
- endExecution()
- finalize() - the graceful exit of the method, this one is usually used
for the agent restart
"""
jobDB = None
logDB = None
matchedTime = 7200
rescheduledTime = 600
completedTime = 86400
#############################################################################
def initialize(self):
    """Create the database handles and set the default polling time.

    Logs a message when the 'Enable' option is false.

    :return: S_OK()
    """
    self.jobDB = JobDB()
    self.logDB = JobLoggingDB()
    self.am_setOption('PollingTime', 3600)
    enabled = self.am_getOption('Enable', True)
    if not enabled:
        self.log.info('Stalled Job Agent running in disabled mode')
    return S_OK()
#############################################################################
def execute(self):
    """Run one agent cycle.

    Reads the timing options, converts the stalled and failed thresholds
    from watchdog cycles to seconds-like units (cycles * watchdog period)
    and runs the four processing steps. A failing step is logged and does
    not stop the following ones.

    :return: S_OK, or S_ERROR when the system instance cannot be found
    """
    self.log.verbose('Waking up Stalled Job Agent')

    instance = getSystemInstance('WorkloadManagement')
    if not instance:
        return S_ERROR('Can not get the WorkloadManagement system instance')
    wrapperSection = cfgPath('Systems', 'WorkloadManagement', instance, 'JobWrapper')

    stalledCycles = self.am_getOption('StalledTimeHours', 2)
    failedCycles = self.am_getOption('FailedTimeHours', 6)
    self.matchedTime = self.am_getOption('MatchedTime', self.matchedTime)
    self.rescheduledTime = self.am_getOption('RescheduledTime', self.rescheduledTime)
    self.completedTime = self.am_getOption('CompletedTime', self.completedTime)
    self.log.verbose('StalledTime = %s cycles' % stalledCycles)
    self.log.verbose('FailedTime = %s cycles' % failedCycles)

    # The watchdog period is never shorter than MinCheckingTime
    checkingTime = gConfig.getValue(cfgPath(wrapperSection, 'CheckingTime'), 30 * 60)
    minCheckingTime = gConfig.getValue(cfgPath(wrapperSection, 'MinCheckingTime'), 20 * 60)
    watchdogCycle = max(checkingTime, minCheckingTime)

    # Add half cycle to avoid race conditions
    stalledTime = watchdogCycle * (stalledCycles + 0.5)
    failedTime = watchdogCycle * (failedCycles + 0.5)

    outcome = self.__markStalledJobs(stalledTime)
    if not outcome['OK']:
        self.log.error('Failed to detect stalled jobs', outcome['Message'])

    # Note, jobs will be revived automatically during the heartbeat signal phase and
    # subsequent status changes will result in jobs not being selected by the
    # stalled job agent.
    outcome = self.__failStalledJobs(failedTime)
    if not outcome['OK']:
        self.log.error('Failed to process stalled jobs', outcome['Message'])

    outcome = self.__failCompletedJobs()
    if not outcome['OK']:
        self.log.error('Failed to process completed jobs', outcome['Message'])

    outcome = self.__kickStuckJobs()
    if not outcome['OK']:
        self.log.error('Failed to kick stuck jobs', outcome['Message'])

    return S_OK('Stalled Job Agent cycle complete')
#############################################################################
def __markStalledJobs( self, stalledTime ):
""" Identifies stalled jobs running without update longer than stalledTime.
"""
stalledCounter = 0
runningCounter = 0
result = self.jobDB.selectJobs( {'Status':'Running'} )
if not result['OK']:
return result
if not result['Value']:
return S_OK()
jobs = result['Value']
self.log.info( '%s Running jobs will be checked for being stalled' % ( len( jobs ) ) )
jobs.sort()
# jobs = jobs[:10] #for debugging
for job in jobs:
result = self.__getStalledJob( job, stalledTime )
if result['OK']:
self.log.verbose( 'Updating status to Stalled for job %s' % ( job ) )
self.__updateJobStatus( job, 'Stalled' )
stalledCounter += 1
else:
self.log.verbose( result['Message'] )
#.........这里部分代码省略.........
示例6: OptimizerModule
# 需要导入模块: from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB [as 别名]
# 或者: from DIRAC.WorkloadManagementSystem.DB.JobDB.JobDB import selectJobs [as 别名]
class OptimizerModule( AgentModule ):
"""
The specific agents must provide the following methods:
- initialize() for initial settings
- beginExecution()
- execute() - the main method called in the agent cycle
- endExecution()
- finalize() - the graceful exit of the method, this one is usually used
for the agent restart
"""
#############################################################################
def initialize(self, jobDB=False, logDB=False):
    """Initialise the optimizer agent.

    :param jobDB: JobDB instance to use; a new one is created when falsy
    :param logDB: JobLoggingDB instance to use; a new one is created when falsy
    :return: the result of self.initializeOptimizer()
    """
    self.jobDB = jobDB if jobDB else JobDB()
    self.logDB = logDB if logDB else JobLoggingDB()

    # The optimizer name is the agent name without a trailing "Agent"
    suffix = "Agent"
    agentName = self.am_getModuleParam('agentName')
    if agentName.endswith(suffix):
        agentName = agentName[:-len(suffix)]
    self.am_setModuleParam('optimizerName', agentName)

    # Jobs are picked up in status "Checking" with this optimizer as minor status
    self.startingMajorStatus = "Checking"
    self.startingMinorStatus = self.am_getModuleParam('optimizerName')
    self.requiredJobInfo = 'jdl'
    self.failedStatus = self.am_getOption("FailedJobStatus", 'Failed')
    self.am_setOption("PollingTime", 30)
    return self.initializeOptimizer()
def initializeOptimizer(self):
    """Hook for subclasses; this base implementation just returns S_OK().

    Called by initialize() and again at the start of every execute() cycle.
    """
    return S_OK()
#############################################################################
def execute(self):
    """Run one optimizer cycle.

    Re-runs initializeOptimizer(), selects the jobs whose status (and minor
    status, when set) match this optimizer and calls optimizeJob() on each.
    A job whose definition cannot be retrieved is marked as failed.

    :return: S_OK / S_ERROR
    """
    setup = self.initializeOptimizer()
    if not setup['OK']:
        return setup
    self._initResult = setup['Value']

    wanted = {'Status': self.startingMajorStatus}
    if self.startingMinorStatus:
        wanted['MinorStatus'] = self.startingMinorStatus

    query = self.jobDB.selectJobs(wanted)
    if not query['OK']:
        self.log.warn('Failed to get a job list from the JobDB')
        return S_ERROR('Failed to get a job list from the JobDB')

    pending = query['Value']
    if len(pending) == 0:
        self.log.verbose('No pending jobs to process')
        return S_OK('No work to do')

    for job in pending:
        loaded = self.getJobDefinition(job)
        if not loaded['OK']:
            self.setFailedJob(job, loaded['Message'], '')
            continue
        # The outcome of a single job does not affect the cycle result
        self.optimizeJob(job, loaded['Value']['classad'])
    return S_OK()
#############################################################################
def optimizeJob(self, job, classAdJob):
    """Run this optimizer's checkJob() on one job.

    :param job: job identifier, passed through to checkJob()
    :param classAdJob: job description, passed through to checkJob()
    :return: the result of checkJob(); on failure the job is also marked
             as failed with the returned message
    """
    label = self.am_getModuleParam('optimizerName')
    self.log.info('Job %s will be processed by %sAgent' % (job, label))
    checked = self.checkJob(job, classAdJob)
    if checked['OK']:
        return checked
    self.setFailedJob(job, checked['Message'], classAdJob)
    return checked
#############################################################################
def getJobDefinition( self, job, jobDef = False ):
""" Retrieve JDL of the Job and return jobDef dictionary
"""
if jobDef == False:
jobDef = {}
#If not jdl in jobinfo load it
if 'jdl' not in jobDef:
if 'jdlOriginal' == self.requiredJobInfo:
result = self.jobDB.getJobJDL( job, original = True )
if not result[ 'OK' ]:
self.log.error( "No JDL for job", "%s" % job )
return S_ERROR( "No JDL for job" )
jobDef[ 'jdl' ] = result[ 'Value' ]
#.........这里部分代码省略.........
示例7: MightyOptimizer
# 需要导入模块: from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB [as 别名]
# 或者: from DIRAC.WorkloadManagementSystem.DB.JobDB.JobDB import selectJobs [as 别名]
class MightyOptimizer( AgentModule ):
"""
The specific agents must provide the following methods:
- initialize() for initial settings
- beginExecution()
- execute() - the main method called in the agent cycle
- endExecution()
- finalize() - the graceful exit of the method, this one is usually used
for the agent restart
"""
__jobStates = [ 'Received', 'Checking' ]
def initialize(self):
    """Prepare the agent state.

    Creates the JobDB and JobLoggingDB handles and the empty optimizer
    registry, then sets the polling time to 30.

    :return: S_OK()
    """
    # Database handles
    self.jobDB = JobDB()
    self.jobLoggingDB = JobLoggingDB()
    # Optimizers loaded so far
    self._optimizers = {}
    self.am_setOption("PollingTime", 30)
    return S_OK()
def execute(self):
    """Run one agent cycle.

    Selects the jobs in one of the starting states (optionally restricted to
    the job types in the "JobTypeRestriction" option) and runs each of them
    through optimizeJob() until it reports 'done' or fails.

    :return: S_OK(), or the failed result of the JobDB queries
    """
    jobTypeCondition = self.am_getOption("JobTypeRestriction", [])
    jobCond = {'Status': self.__jobStates}
    if jobTypeCondition:
        jobCond['JobType'] = jobTypeCondition

    result = self.jobDB.selectJobs(jobCond)
    if not result['OK']:
        return result
    jobsList = result['Value']
    self.log.info("Got %s jobs for this iteration" % len(jobsList))
    if not jobsList:
        return S_OK()

    result = self.jobDB.getAttributesForJobList(jobsList)
    if not result['OK']:
        return result
    jobsToProcess = result['Value']

    for jobId in jobsToProcess:
        self.log.info("== Processing job %s == " % jobId)
        jobAttrs = jobsToProcess[jobId]
        jobDef = False
        jobOptimized = False
        jobOK = True
        # Keep passing the job to the next optimizer until it is done
        while not jobOptimized:
            result = self.optimizeJob(jobId, jobAttrs, jobDef)
            if not result['OK']:
                # The original referenced the undefined name ``jobID`` here,
                # turning every optimizer error into a NameError.
                self.log.error("Optimizer %s error" % jobAttrs['MinorStatus'],
                               "Job %s: %s" % (jobId, result['Message']))
                jobOK = False
                break
            optResult = result['Value']
            jobOptimized = optResult['done']
            # Reuse the job definition for the next optimizer when provided
            if 'jobDef' in optResult:
                jobDef = optResult['jobDef']
        if jobOK:
            self.log.info("Finished optimizing job %s" % jobId)
    return S_OK()
def optimizeJob( self, jobId, jobAttrs, jobDef ):
""" The method call for each Job to be optimized
"""
#Get the next optimizer
result = self._getNextOptimizer( jobAttrs )
if not result[ 'OK' ]:
return result
optimizer = result[ 'Value' ]
if not optimizer:
return S_OK( { 'done' : True } )
#If there's no job def then get it
if not jobDef:
result = optimizer.getJobDefinition( jobId, jobDef )
if not result['OK']:
optimizer.setFailedJob( jobId, result[ 'Message' ] )
return result
jobDef = result[ 'Value' ]
#Does the optimizer require a proxy?
shifterEnv = False
if optimizer.am_getModuleParam( 'shifterProxy' ):
shifterEnv = True
result = setupShifterProxyInEnv( optimizer.am_getModuleParam( 'shifterProxy' ),
optimizer.am_getShifterProxyLocation() )
if not result[ 'OK' ]:
return result
#Call the initCycle function
result = self.am_secureCall( optimizer.beginExecution, name = "beginExecution" )
if not result[ 'OK' ]:
return result
#Do the work
result = optimizer.optimizeJob( jobId, jobDef[ 'classad' ] )
if not result[ 'OK' ]:
return result
nextOptimizer = result[ 'Value' ]
#If there was a shifter proxy, unset it
if shifterEnv:
del( os.environ[ 'X509_USER_PROXY' ] )
#Check if the JDL has changed
newJDL = jobDef[ 'classad' ].asJDL()
if newJDL != jobDef[ 'jdl' ]:
#.........这里部分代码省略.........
示例8: JobCleaningAgent
# 需要导入模块: from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB [as 别名]
# 或者: from DIRAC.WorkloadManagementSystem.DB.JobDB.JobDB import selectJobs [as 别名]
class JobCleaningAgent(AgentModule):
"""
The specific agents must provide the following methods:
- initialize() for initial settings
- beginExecution()
- execute() - the main method called in the agent cycle
- endExecution()
- finalize() - the graceful exit of the method, this one is usually used
for the agent restart
"""
#############################################################################
def initialize(self):
    """Set the agent defaults.

    Creates the JobDB and TaskQueueDB handles, reads the production job
    types that are excluded from cleaning and the ``MaxJobsAtOnce``,
    ``JobByJob`` and ``ThrottlingPeriod`` options.

    :return: S_OK()
    """
    self.am_setOption("PollingTime", 60)
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()

    self.prod_types = self.am_getOption(
        "ProductionTypes", ["DataReconstruction", "DataStripping", "MCSimulation", "Merge", "production"]
    )
    # str.join works on Python 2 and 3, unlike the Python 2-only string.join()
    gLogger.info(
        "Will exclude the following Production types from cleaning %s" % ", ".join(self.prod_types)
    )

    self.maxJobsAtOnce = self.am_getOption("MaxJobsAtOnce", 200)
    self.jobByJob = self.am_getOption("JobByJob", True)
    self.throttlingPeriod = self.am_getOption("ThrottlingPeriod", 0.0)
    return S_OK()
def __getAllowedJobTypes(self):
    """Work out which job types may be cleaned.

    :return: the failed result of getDistinctJobAttributes, otherwise S_OK
             with the list of job types not listed in self.prod_types
    """
    known = self.jobDB.getDistinctJobAttributes("JobType")
    if not known["OK"]:
        return known
    # Keep every job type that is not a production type
    cleanable = [name for name in known["Value"] if name not in self.prod_types]
    self.log.notice("JobTypes to clean %s" % cleanable)
    return S_OK(cleanable)
#############################################################################
def execute(self):
    """Run one cleaning cycle.

    Removes the jobs in "Deleted" state first, then, for every status in
    REMOVE_STATUS_DELAY, the jobs of a cleanable type that are older than
    the delay configured for that status.

    :return: S_OK(), or the failed result of the first two steps
    """
    # Jobs already flagged as "Deleted" go first
    step = self.removeJobsByStatus({"Status": "Deleted"})
    if not step["OK"]:
        return step

    # Job types that are allowed to be cleaned
    step = self.__getAllowedJobTypes()
    if not step["OK"]:
        return step
    allowedTypes = step["Value"]

    # Jobs in a final status, once they are old enough
    for status, days in REMOVE_STATUS_DELAY.items():
        cutoff = str(Time.dateTime() - days * Time.day)
        step = self.removeJobsByStatus({"JobType": allowedTypes, "Status": status}, cutoff)
        if not step["OK"]:
            # A failure for one status does not stop the others
            gLogger.warn("Failed to remove jobs in status %s" % status)
    return S_OK()
def removeJobsByStatus(self, condDict, delay=False):
""" Remove deleted jobs
"""
if delay:
gLogger.verbose("Removing jobs with %s and older than %s" % (condDict, delay))
result = self.jobDB.selectJobs(condDict, older=delay)
else:
gLogger.verbose("Removing jobs with %s " % condDict)
result = self.jobDB.selectJobs(condDict)
if not result["OK"]:
return result
jobList = result["Value"]
if len(jobList) > self.maxJobsAtOnce:
jobList = jobList[: self.maxJobsAtOnce]
if not jobList:
return S_OK()
self.log.notice("Deleting %s jobs for %s" % (len(jobList), condDict))
count = 0
error_count = 0
result = SandboxStoreClient(useCertificates=True).unassignJobs(jobList)
if not result["OK"]:
gLogger.warn("Cannot unassign jobs to sandboxes", result["Message"])
if self.jobByJob:
for jobID in jobList:
resultJobDB = self.jobDB.removeJobFromDB(jobID)
resultTQ = self.taskQueueDB.deleteJob(jobID)
if not resultJobDB["OK"]:
gLogger.warn("Failed to remove job %d from JobDB" % jobID, result["Message"])
#.........这里部分代码省略.........
示例9: JobDB
# 需要导入模块: from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB [as 别名]
# 或者: from DIRAC.WorkloadManagementSystem.DB.JobDB.JobDB import selectJobs [as 别名]
########################################################################
__RCSID__ = "cd6b25c (2010-12-04 11:45:50 +0000) Ricardo Graciani <[email protected]>"
# Script: recreate the TaskQueueDB tables and reschedule every job that is in
# status Received, Checking or Waiting, keeping its reschedule counter.
import sys
import DIRAC
from DIRAC.Core.Base import Script

Script.parseCommandLine(ignoreErrors=True)
args = Script.getPositionalArgs()

# NOTE(review): the DB imports stay after parseCommandLine(), as in the
# original script -- presumably the configuration must be loaded first; confirm.
from DIRAC.WorkloadManagementSystem.DB.TaskQueueDB import TaskQueueDB
from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB

jobdb = JobDB()
tqdb = TaskQueueDB()

result = jobdb.selectJobs({'Status': ['Received', 'Checking', 'Waiting']})
if not result['OK']:
    print(result['Message'])
    sys.exit(1)
jobList = result['Value']

print(tqdb.forceRecreationOfTables())

# NOTE(review): the loop nesting was reconstructed from a listing that had
# lost its indentation -- confirm against the upstream script.
for job in jobList:
    result = jobdb.getJobAttribute(job, 'RescheduleCounter')
    if result['OK']:
        rC = result['Value']
    else:
        # Fall back to 0. The original then unconditionally read
        # result['Value'], discarding this fallback on a failed result.
        print("Cannot get reschedule counter for job %s" % job)
        rC = 0
    # Reset a counter at or above the maximum before rescheduling
    if rC >= jobdb.maxRescheduling:
        jobdb.setJobAttribute(job, "RescheduleCounter", "0")
    jobdb.rescheduleJob(job)
    # Restore the counter value the job had before this script ran
    jobdb.setJobAttribute(job, "RescheduleCounter", rC)
示例10: StalledJobAgent
# 需要导入模块: from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB [as 别名]
# 或者: from DIRAC.WorkloadManagementSystem.DB.JobDB.JobDB import selectJobs [as 别名]
class StalledJobAgent(AgentModule):
"""
The specific agents must provide the following methods:
- initialize() for initial settings
- beginExecution()
- execute() - the main method called in the agent cycle
- endExecution()
- finalize() - the graceful exit of the method, this one is usually used
for the agent restart
"""
jobDB = None
logDB = None
matchedTime = 7200
rescheduledTime = 600
completedTime = 86400
#############################################################################
def initialize(self):
    """Create the database handles and set the default polling time.

    Logs a message when the "Enable" option is false.

    :return: S_OK()
    """
    self.jobDB = JobDB()
    self.logDB = JobLoggingDB()
    self.am_setOption("PollingTime", 3600)
    if self.am_getOption("Enable", True):
        return S_OK()
    self.log.info("Stalled Job Agent running in disabled mode")
    return S_OK()
#############################################################################
def execute(self):
    """Run one agent cycle.

    Reads the timing options, multiplies the stalled and failed thresholds
    (given in watchdog cycles) by the watchdog period and runs the four
    processing steps. A failing step is logged and does not stop the
    following ones.

    :return: S_OK, or S_ERROR when the system instance cannot be found
    """
    log = self.log
    log.verbose("Waking up Stalled Job Agent")

    wmsInstance = getSystemInstance("WorkloadManagement")
    if not wmsInstance:
        return S_ERROR("Can not get the WorkloadManagement system instance")
    section = cfgPath("Systems", "WorkloadManagement", wmsInstance, "JobWrapper")

    nStalled = self.am_getOption("StalledTimeHours", 2)
    nFailed = self.am_getOption("FailedTimeHours", 6)
    self.matchedTime = self.am_getOption("MatchedTime", self.matchedTime)
    self.rescheduledTime = self.am_getOption("RescheduledTime", self.rescheduledTime)
    self.completedTime = self.am_getOption("CompletedTime", self.completedTime)
    log.verbose("StalledTime = %s cycles" % nStalled)
    log.verbose("FailedTime = %s cycles" % nFailed)

    # The watchdog period is never shorter than MinCheckingTime
    period = gConfig.getValue(cfgPath(section, "CheckingTime"), 30 * 60)
    floor = gConfig.getValue(cfgPath(section, "MinCheckingTime"), 20 * 60)
    if floor > period:
        period = floor

    # Add half cycle to avoid race conditions
    stalledTime = period * (nStalled + 0.5)
    failedTime = period * (nFailed + 0.5)

    status = self.__markStalledJobs(stalledTime)
    if not status["OK"]:
        log.error("Failed to detect stalled jobs", status["Message"])

    # Note, jobs will be revived automatically during the heartbeat signal phase and
    # subsequent status changes will result in jobs not being selected by the
    # stalled job agent.
    status = self.__failStalledJobs(failedTime)
    if not status["OK"]:
        log.error("Failed to process stalled jobs", status["Message"])

    status = self.__failCompletedJobs()
    if not status["OK"]:
        log.error("Failed to process completed jobs", status["Message"])

    status = self.__kickStuckJobs()
    if not status["OK"]:
        log.error("Failed to kick stuck jobs", status["Message"])

    return S_OK("Stalled Job Agent cycle complete")
#############################################################################
def __markStalledJobs(self, stalledTime):
""" Identifies stalled jobs running without update longer than stalledTime.
"""
stalledCounter = 0
runningCounter = 0
result = self.jobDB.selectJobs({"Status": "Running"})
if not result["OK"]:
return result
if not result["Value"]:
return S_OK()
jobs = result["Value"]
self.log.info("%s Running jobs will be checked for being stalled" % (len(jobs)))
jobs.sort()
# jobs = jobs[:10] #for debugging
for job in jobs:
result = self.__getStalledJob(job, stalledTime)
if result["OK"]:
self.log.verbose("Updating status to Stalled for job %s" % (job))
self.__updateJobStatus(job, "Stalled")
stalledCounter += 1
else:
#.........这里部分代码省略.........
示例11: JobCleaningAgent
# 需要导入模块: from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB [as 别名]
# 或者: from DIRAC.WorkloadManagementSystem.DB.JobDB.JobDB import selectJobs [as 别名]
#.........这里部分代码省略.........
result = self.removeJobsByStatus( { 'Status' : 'Deleted' } )
if not result[ 'OK' ]:
return result
#Get all the Job types that can be cleaned
result = self.__getAllowedJobTypes()
if not result[ 'OK' ]:
return result
# No jobs in the system subject to removal
if not result['Value']:
return S_OK()
baseCond = { 'JobType' : result[ 'Value' ] }
# Remove jobs with final status
for status in self.removeStatusDelay:
delay = self.removeStatusDelay[ status ]
if delay < 0:
# Negative delay means don't delete anything...
continue
condDict = dict( baseCond )
if status != 'Any':
condDict[ 'Status' ] = status
delTime = str( Time.dateTime() - delay * Time.day )
result = self.removeJobsByStatus( condDict, delTime )
if not result['OK']:
gLogger.warn( 'Failed to remove jobs in status %s' % status )
return S_OK()
def removeJobsByStatus( self, condDict, delay = False ):
""" Select the jobs matching condDict and prepare them for removal.

:param condDict: selection conditions passed to JobDB.selectJobs
:param delay: when truthy it is passed to selectJobs as ``older``
:return: the failed result of any step, or S_OK() when no job is selected

NOTE(review): this listing ends after the failed jobs are dropped from the
job list; the rest of the method is not visible here.
"""
if delay:
# NOTE(review): the message says "day(s)" but the caller visible above passes
# a timestamp string (delTime), not a number of days -- confirm the wording.
gLogger.verbose( "Removing jobs with %s and older than %s day(s)" % ( condDict, delay ) )
result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
else:
gLogger.verbose( "Removing jobs with %s " % condDict )
result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )
if not result['OK']:
return result
jobList = result['Value']
# Never handle more than maxJobsAtOnce jobs, even if the query returned more
if len(jobList) > self.maxJobsAtOnce:
jobList = jobList[:self.maxJobsAtOnce]
if not jobList:
return S_OK()
self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )
# Counters are not used in the visible part of the method
count = 0
error_count = 0
# Detach the jobs from their sandboxes; a failure aborts the removal
result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
if not result[ 'OK' ]:
gLogger.error("Cannot unassign jobs to sandboxes", result['Message'])
return result
# Schedule the removal of oversized sandboxes; a failure aborts the removal
result = self.deleteJobOversizedSandbox(jobList)
if not result[ 'OK' ]:
gLogger.error(
"Cannot schedule removal of oversized sandboxes", result['Message'])
return result
# Jobs reported under 'Failed' are taken out of the list to process
failedJobs = result['Value']['Failed']
for job in failedJobs:
jobList.pop(jobList.index(job))
示例12: JobCleaningAgent
# 需要导入模块: from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB [as 别名]
# 或者: from DIRAC.WorkloadManagementSystem.DB.JobDB.JobDB import selectJobs [as 别名]
class JobCleaningAgent( AgentModule ):
"""
The specific agents must provide the following methods:
- initialize() for initial settings
- beginExecution()
- execute() - the main method called in the agent cycle
- endExecution()
- finalize() - the graceful exit of the method, this one is usually used
for the agent restart
"""
#############################################################################
def initialize(self):
    """Set the agent defaults.

    Creates the JobDB, TaskQueueDB and JobLoggingDB handles, determines the
    production job types that are excluded from cleaning and reads the
    ``MaxJobsAtOnce``, ``JobByJob`` and ``ThrottlingPeriod`` options.

    :return: S_OK()
    """
    self.am_setOption("PollingTime", 60)
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()

    # The agent option wins; the Operations value is only the fallback
    agentTSTypes = self.am_getOption('ProductionTypes', [])
    if agentTSTypes:
        self.prod_types = agentTSTypes
    else:
        self.prod_types = Operations().getValue('Transformations/DataProcessing', ['MCSimulation', 'Merge'])
    # str.join works on Python 2 and 3, unlike the Python 2-only string.join()
    gLogger.info('Will exclude the following Production types from cleaning %s' % ', '.join(self.prod_types))

    self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce', 200)
    self.jobByJob = self.am_getOption('JobByJob', True)
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)
    return S_OK()
def __getAllowedJobTypes(self):
    """Work out which job types may be cleaned.

    :return: the failed result of getDistinctJobAttributes, otherwise S_OK
             with the list of job types not listed in self.prod_types
    """
    lookup = self.jobDB.getDistinctJobAttributes('JobType')
    if not lookup['OK']:
        return lookup
    # Drop the production types, keep everything else
    remaining = [entry for entry in lookup['Value'] if entry not in self.prod_types]
    self.log.notice("JobTypes to clean %s" % remaining)
    return S_OK(remaining)
#############################################################################
def execute(self):
    """Run one cleaning cycle.

    Removes the jobs in "Deleted" state first, then, for every status in
    REMOVE_STATUS_DELAY, the jobs of a cleanable type that are older than
    the delay configured for that status.

    :return: S_OK(), or the failed result of the first two steps
    """
    # Jobs already flagged as "Deleted" go first
    res = self.removeJobsByStatus({'Status': 'Deleted'})
    if not res['OK']:
        return res

    # Job types that are allowed to be cleaned
    res = self.__getAllowedJobTypes()
    if not res['OK']:
        return res
    jobTypes = res['Value']

    # Jobs in a final status, once they are old enough
    for finalStatus, nDays in REMOVE_STATUS_DELAY.items():
        conditions = {'JobType': jobTypes}
        conditions['Status'] = finalStatus
        olderThan = str(Time.dateTime() - nDays * Time.day)
        res = self.removeJobsByStatus(conditions, olderThan)
        if not res['OK']:
            # A failure for one status does not stop the others
            gLogger.warn('Failed to remove jobs in status %s' % finalStatus)
    return S_OK()
def removeJobsByStatus( self, condDict, delay = False ):
""" Remove deleted jobs
"""
if delay:
gLogger.verbose( "Removing jobs with %s and older than %s" % ( condDict, delay ) )
result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
else:
gLogger.verbose( "Removing jobs with %s " % condDict )
result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )
if not result['OK']:
return result
jobList = result['Value']
if len(jobList) > self.maxJobsAtOnce:
jobList = jobList[:self.maxJobsAtOnce]
if not jobList:
return S_OK()
self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )
count = 0
error_count = 0
result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
if not result[ 'OK' ]:
gLogger.warn( "Cannot unassign jobs to sandboxes", result[ 'Message' ] )
if self.jobByJob:
for jobID in jobList:
resultJobDB = self.jobDB.removeJobFromDB( jobID )
resultTQ = self.taskQueueDB.deleteJob( jobID )
resultLogDB = self.jobLoggingDB.deleteJob( jobID )
#.........这里部分代码省略.........