[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Condor-users] slot1: Job Requirements check failed!



Hi

This is a follow-up to my last email, but it addresses a completely different 
issue.

I'm using a job submit file as follows:
Executable     = sleep.gpu.sh
Arguments      = 10 $$(GPU_DEV) $$(GPU_NAME) $$(GPU_CAPABILITY) \
$$(GPU_GLOBALMEM_MB) $$(GPU_MULTIPROC) $$(GPU_NUMCORES) $$(GPU_CLOCK_GHZ) \
$$(GPU_CUDA_DRV) $$(GPU_CUDA_RUN)
Error   = logs/err.$(Process)
Output  = logs/out.$(Process)
Log = /local/user/carsten/foo.log
Requirements = GPU_CAPABILITY >= 1.9
+WantGPU=True
Universe = vanilla
Queue 1

where sleep.gpu.sh only prints out its arguments and sleeps for $1 
seconds.

With "Requirements = GPU_CAPABILITY >= 2.0" I'm trying to steer the job to a 
machine that has this attribute set. It kind of works: the match is made, but 
when the startd wants to start the job, it just says "slot1: Job Requirements 
check failed!" and goes back to idle (full debug StartLog attached).

$ gpu010:/var/log/condor# grep -i require /tmp/StartLog 
AutoClusterAttrs = 
"JobUniverse,LastCheckpointPlatform,NumCkpts,Scheduler,Owner,NeedGpu,WantGPU,DiskUsage,ImageSize,RequestMemory,FileSystemDomain,Requirements,NiceUser,ConcurrencyLimits"
Requirements = (GPU_CAPABILITY >= 2.000000) && (Arch == "X86_64") && (OpSys == 
"LINUX") && (Disk >= DiskUsage) && ((Memory * 1024) >= ImageSize) && 
((RequestMemory * 1024) >= ImageSize) && (TARGET.FileSystemDomain == 
MY.FileSystemDomain)
Requirements = (START) && (IsValidCheckpointPlatform)
AutoClusterAttrs = 
"JobUniverse,LastCheckpointPlatform,NumCkpts,Scheduler,Owner,NeedGpu,WantGPU,DiskUsage,ImageSize,RequestMemory,FileSystemDomain,Requirements,NiceUser,ConcurrencyLimits"
Requirements = (GPU_CAPABILITY >= 2.000000) && (Arch == "X86_64") && (OpSys == 
"LINUX") && (Disk >= DiskUsage) && ((Memory * 1024) >= ImageSize) && 
((RequestMemory * 1024) >= ImageSize) && (TARGET.FileSystemDomain == 
MY.FileSystemDomain)
Requirements = (START) && (IsValidCheckpointPlatform)
12/10 10:43:42 slot1: Job Requirements check failed!


I'm not quite sure what is causing Condor to not start the job. At first I 
thought it might be the floating-point comparison, but even with

Requirements = GPU_CAPABILITY >= 1.9

it matches, but does not start.

Any ideas?

Cheers

Carsten
12/10 10:42:00 Got SIGHUP.  Re-reading config files.
12/10 10:42:00 History file rotation is enabled.
12/10 10:42:00   Maximum history file size is: 20971520 bytes
12/10 10:42:00   Number of rotated history files is: 2
12/10 10:42:00 CronMgr: Job string is 'userban'
12/10 10:42:00 CronMgr: Job name is 'userban'
12/10 10:42:15 Cron: Running job 'userban' (/var/lib/condor/scripts/userban.pl)
12/10 10:43:42 slot1: match_info called
12/10 10:43:42 slot1: Received match <10.12.0.10:59313>#1291903313#269#...
12/10 10:43:42 slot1: State change: match notification protocol successful
12/10 10:43:42 slot1: Changing state: Owner -> Matched
12/10 10:43:42 slot1: REQ_CLASSAD:
MyType = "Job"
TargetType = "Machine"
GlobalJobId = "atlas4.atlas.aei.uni-hannover.de#1702098.0#1291974094"
LastJobStatus = 0
JobStatus = 1
ProcId = 0
AutoClusterId = 1
AutoClusterAttrs = "JobUniverse,LastCheckpointPlatform,NumCkpts,Scheduler,Owner,NeedGpu,WantGPU,DiskUsage,ImageSize,RequestMemory,FileSystemDomain,Requirements,NiceUser,ConcurrencyLimits"
WantMatchDiagnostics = TRUE
LastMatchTime = 1291974222
NumJobMatches = 1
ClusterId = 1702098
QDate = 1291974094
CompletionDate = 0
Owner = "carsten"
RemoteWallClockTime = 0.000000
LocalUserCpu = 0.000000
LocalSysCpu = 0.000000
RemoteUserCpu = 0.000000
RemoteSysCpu = 0.000000
ExitStatus = 0
NumCkpts_RAW = 0
NumCkpts = 0
NumJobStarts = 0
NumRestarts = 0
NumSystemHolds = 0
CommittedTime = 0
TotalSuspensions = 0
LastSuspensionTime = 0
CumulativeSuspensionTime = 0
ExitBySignal = FALSE
notification = NEVER
CondorVersion = "$CondorVersion: 7.4.4 Oct 13 2010 BuildID: 279383 $"
CondorPlatform = "$CondorPlatform: X86_64-LINUX_DEBIAN50 $"
RootDir = "/"
Iwd = "/home/carsten/condorcompile"
JobUniverse = 5
Cmd = "/home/carsten/condorcompile/sleep.gpu.sh"
MinHosts = 1
MaxHosts = 1
CurrentHosts = 0
WantRemoteSyscalls = FALSE
WantCheckpoint = FALSE
RequestCpus = 1
EnteredCurrentStatus = 1291974094
JobPrio = 0
User = "carsten@xxxxxxxxxxx"
NiceUser = FALSE
JobNotification = 0
WantRemoteIO = FALSE
UserLog = "/local/user/carsten/foo.log"
CoreSize = 0
KillSig = "SIGTERM"
Rank = 0.000000
In = "/dev/null"
TransferIn = FALSE
Out = "logs/out.0"
StreamOut = FALSE
Err = "logs/err.0"
StreamErr = FALSE
BufferSize = 524288
BufferBlockSize = 32768
ShouldTransferFiles = "NO"
NeverCreateJobSandbox = TRUE
TransferFiles = "NEVER"
ImageSize_RAW = 1
ImageSize = 1
ExecutableSize_RAW = 1
ExecutableSize = 1
DiskUsage_RAW = 1
DiskUsage = 1
RequestMemory = ceiling(ifThenElse(JobVMMemory =!= UNDEFINED, JobVMMemory, ImageSize / 1024.000000))
RequestDisk = DiskUsage
Requirements = (GPU_CAPABILITY >= 2.000000) && (Arch == "X86_64") && (OpSys == "LINUX") && (Disk >= DiskUsage) && ((Memory * 1024) >= ImageSize) && ((RequestMemory * 1024) >= ImageSize) && (TARGET.FileSystemDomain == MY.FileSystemDomain)
FileSystemDomain = "atlas.local"
JobLeaseDuration = 7200
PeriodicHold = FALSE
PeriodicRelease = FALSE
PeriodicRemove = FALSE
OnExitHold = FALSE
OnExitRemove = TRUE
LeaveJobInQueue = FALSE
WantGPU = TRUE
Environment = ""
Arguments = "10 0 Tesla C2050 2.000000 2687 14 32 1.150000 3.200000 3.200000"
ServerTime = 1291974222
12/10 10:43:42 slot1: MACHINE_CLASSAD:
MyType = "Machine"
TargetType = "Job"
SUSPEND = FALSE
CONTINUE = TRUE
PREEMPT = (TARGET.ImageSize > 1.100000 * Memory * 1024)
KILL = FALSE
WANT_SUSPEND = FALSE
WANT_VACATE = FALSE
WANT_HOLD = FALSE
CLAIM_WORKLIFE = 3600
PERIODIC_CHECKPOINT = (CurrentTime - LastPeriodicCheckpoint) > ((60 * 60) + 10 * 60)
RunBenchmarks = (LastBenchmark == 0) || ((CurrentTime - LastBenchmark) >= (4 * (60 * 60)))
START_BACKFILL = (CurrentTime - EnteredCurrentState) > (1 * 60)
EVICT_BACKFILL = FALSE
IsOwner = (START =?= FALSE)
Name = "slot1@xxxxxxxxxxxxxxxxxx"
Rank = (target.WantGPU =?= TRUE) * 10000000
CpuBusy = ((LoadAvg - CondorLoadAvg) >= 0.500000)
SlotWeight = Cpus
Unhibernate = MY.MachineLastMatchTime =!= UNDEFINED
MyCurrentTime = 1291974120
Machine = "gpu010.atlas.local"
PublicNetworkIpAddr = "<10.12.0.10:59313>"
GPU_DEV = 0
GPU_NAME = "Tesla C2050"
GPU_CAPABILITY = 2.000000
GPU_GLOBALMEM_MB = 2687
GPU_MULTIPROC = 14
GPU_NUMCORES = 32
GPU_CLOCK_GHZ = 1.150000
GPU_CUDA_DRV = 3.200000
GPU_CUDA_RUN = 3.200000
CondorVersion = "$CondorVersion: 7.4.2 Mar 30 2010 BuildID: 227044 $"
CondorPlatform = "$CondorPlatform: X86_64-LINUX_DEBIAN50 $"
SlotID = 1
VirtualMachineID = 1
HasFileTransfer = TRUE
HasPerFileEncryption = TRUE
HasReconnect = TRUE
HasMPI = TRUE
HasTDP = TRUE
HasJobDeferral = TRUE
HasJICLocalConfig = TRUE
HasJICLocalStdin = TRUE
JavaVendor = "Free Software Foundation, Inc."
JavaVersion = "1.5.0"
JavaSpecificationVersion = "1.5"
JavaMFlops = 21.875904
HasJava = TRUE
HasRemoteSyscalls = TRUE
HasCheckpointing = TRUE
StarterAbilityList = "HasFileTransfer,HasPerFileEncryption,HasReconnect,HasMPI,HasTDP,HasJobDeferral,HasJICLocalConfig,HasJICLocalStdin,HasJava,HasVM,HasRemoteSyscalls,HasCheckpointing"
LastBenchmark = 1291903325
TotalSlots = 4
TotalVirtualMachines = 4
CpuBusyTime = 0
CpuIsBusy = FALSE
TimeToLive = 2147483647
VirtualMemory = 1048239
TotalDisk = 660312760
Disk = 165078192
CondorLoadAvg = 0.000000
LoadAvg = 0.000000
KeyboardIdle = 70807
ConsoleIdle = 70807
Memory = 2960
Cpus = 1
StartdIpAddr = "<10.12.0.10:59313>"
Arch = "X86_64"
OpSys = "LINUX"
UidDomain = "atlas.local"
FileSystemDomain = "atlas.local"
HasIOProxy = TRUE
CheckpointPlatform = "LINUX X86_64 2.6.x normal 0x2aaaaaac7000"
TotalVirtualMemory = 4192956
TotalCpus = 4
TotalMemory = 11843
KFlops = 1276332
Mips = 9486
TotalLoadAvg = 0.180000
TotalCondorLoadAvg = 0.000000
ClockMin = 642
ClockDay = 5
HasVM = FALSE
HibernationLevel = 0
HibernationState = "NONE"
HibernationSupportedStates = ""
CanHibernate = FALSE
HardwareAddress = "00:e0:81:b8:9f:f5"
SubnetMask = "255.0.0.0"
IsWakeOnLanSupported = TRUE
IsWakeOnLanEnabled = TRUE
IsWakeAble = TRUE
WakeOnLanSupportedFlags = "Physical Packet,UniCast Packet,MultiCast Packet,BroadCast Packet,ARP Packet,Magic Packet"
WakeOnLanEnabledFlags = "Magic Packet"
MaxJobRetirementTime = 0
LastFetchWorkSpawned = 0
LastFetchWorkCompleted = 0
NextFetchWorkDelay = -1
CurrentRank = 0.000000
MonitorSelfTime = 1291973895
MonitorSelfCPUUsage = 0.049828
MonitorSelfImageSize = 30492.000000
MonitorSelfResidentSetSize = 5956
MonitorSelfAge = 0
MonitorSelfRegisteredSocketCount = 1
State = "Matched"
EnteredCurrentState = 1291974222
Activity = "Idle"
EnteredCurrentActivity = 1291974222
TotalTimeOwnerIdle = 67000
TotalTimeMatchedIdle = 1
TotalTimeClaimedIdle = 40
TotalTimeClaimedBusy = 3847
TotalTimePreemptingVacating = 15
Start = (Owner =?= "carsten") && (JobUniverse =?= 1 || target.WantGPU =?= TRUE)
Requirements = (START) && (IsValidCheckpointPlatform)
IsValidCheckpointPlatform = (((TARGET.JobUniverse == 1) == FALSE) || ((MY.CheckpointPlatform =!= UNDEFINED) && ((TARGET.LastCheckpointPlatform =?= MY.CheckpointPlatform) || (TARGET.NumCkpts == 0))))
12/10 10:43:42 slot1: Request accepted.
12/10 10:43:42 slot1: Remote owner is carsten@xxxxxxxxxxx
12/10 10:43:42 slot1: State change: claiming protocol successful
12/10 10:43:42 slot1: Changing state: Matched -> Claimed
12/10 10:43:42 slot1: Got activate_claim request from shadow (<10.20.30.4:58528>)
12/10 10:43:42 slot1: REQ_CLASSAD:
MyType = "(unknown type)"
TargetType = "(unknown type)"
GlobalJobId = "atlas4.atlas.aei.uni-hannover.de#1702098.0#1291974094"
ProcId = 0
AutoClusterId = 1
AutoClusterAttrs = "JobUniverse,LastCheckpointPlatform,NumCkpts,Scheduler,Owner,NeedGpu,WantGPU,DiskUsage,ImageSize,RequestMemory,FileSystemDomain,Requirements,NiceUser,ConcurrencyLimits"
WantMatchDiagnostics = TRUE
LastMatchTime = 1291974222
NumJobMatches = 1
OrigMaxHosts = 1
LastJobStatus = 1
JobStatus = 2
EnteredCurrentStatus = 1291974222
LastSuspensionTime = 0
CurrentHosts = 1
PublicClaimId = "<10.12.0.10:59313>#1291903313#269#..."
StartdIpAddr = "<10.12.0.10:59313>"
LastJobLeaseRenewal = 1291974222
RemoteHost = "slot1@xxxxxxxxxxxxxxxxxx"
RemoteSlotID = 1
StartdPrincipal = "10.12.0.10"
ShadowBday = 1291974222
JobStartDate = 1291974222
JobCurrentStartDate = 1291974222
NumShadowStarts = 1
JobRunCount = 1
ClusterId = 1702098
QDate = 1291974094
CompletionDate = 0
Owner = "carsten"
RemoteWallClockTime = 0.000000
LocalUserCpu = 0.000000
LocalSysCpu = 0.000000
RemoteUserCpu = 0.000000
RemoteSysCpu = 0.000000
ExitStatus = 0
NumCkpts_RAW = 0
NumCkpts = 0
NumJobStarts = 0
NumRestarts = 0
NumSystemHolds = 0
CommittedTime = 0
TotalSuspensions = 0
CumulativeSuspensionTime = 0
ExitBySignal = FALSE
notification = NEVER
CondorVersion = "$CondorVersion: 7.4.4 Oct 13 2010 BuildID: 279383 $"
CondorPlatform = "$CondorPlatform: X86_64-LINUX_DEBIAN50 $"
RootDir = "/"
Iwd = "/home/carsten/condorcompile"
JobUniverse = 5
Cmd = "/home/carsten/condorcompile/sleep.gpu.sh"
MinHosts = 1
MaxHosts = 1
WantRemoteSyscalls = FALSE
WantCheckpoint = FALSE
RequestCpus = 1
JobPrio = 0
User = "carsten@xxxxxxxxxxx"
NiceUser = FALSE
JobNotification = 0
WantRemoteIO = FALSE
UserLog = "/local/user/carsten/foo.log"
CoreSize = 0
KillSig = "SIGTERM"
Rank = 0.000000
In = "/dev/null"
TransferIn = FALSE
Out = "logs/out.0"
StreamOut = FALSE
Err = "logs/err.0"
StreamErr = FALSE
BufferSize = 524288
BufferBlockSize = 32768
ShouldTransferFiles = "NO"
NeverCreateJobSandbox = TRUE
TransferFiles = "NEVER"
ImageSize_RAW = 1
ImageSize = 1
ExecutableSize_RAW = 1
ExecutableSize = 1
DiskUsage_RAW = 1
DiskUsage = 1
RequestMemory = ceiling(ifThenElse(JobVMMemory =!= UNDEFINED, JobVMMemory, ImageSize / 1024.000000))
RequestDisk = DiskUsage
Requirements = (GPU_CAPABILITY >= 2.000000) && (Arch == "X86_64") && (OpSys == "LINUX") && (Disk >= DiskUsage) && ((Memory * 1024) >= ImageSize) && ((RequestMemory * 1024) >= ImageSize) && (TARGET.FileSystemDomain == MY.FileSystemDomain)
FileSystemDomain = "atlas.local"
JobLeaseDuration = 7200
PeriodicHold = FALSE
PeriodicRelease = FALSE
PeriodicRemove = FALSE
OnExitHold = FALSE
OnExitRemove = TRUE
LeaveJobInQueue = FALSE
WantGPU = TRUE
Environment = ""
Arguments = "10 0 Tesla C2050 2.000000 2687 14 32 1.150000 3.200000 3.200000"
MyAddress = "<10.20.30.4:37578>"
12/10 10:43:42 slot1: MACHINE_CLASSAD:
MyType = "Machine"
TargetType = "Job"
SUSPEND = FALSE
CONTINUE = TRUE
PREEMPT = (TARGET.ImageSize > 1.100000 * Memory * 1024)
KILL = FALSE
WANT_SUSPEND = FALSE
WANT_VACATE = FALSE
WANT_HOLD = FALSE
CLAIM_WORKLIFE = 3600
PERIODIC_CHECKPOINT = (CurrentTime - LastPeriodicCheckpoint) > ((60 * 60) + 10 * 60)
RunBenchmarks = (LastBenchmark == 0) || ((CurrentTime - LastBenchmark) >= (4 * (60 * 60)))
START_BACKFILL = (CurrentTime - EnteredCurrentState) > (1 * 60)
EVICT_BACKFILL = FALSE
IsOwner = (START =?= FALSE)
Name = "slot1@xxxxxxxxxxxxxxxxxx"
Rank = (target.WantGPU =?= TRUE) * 10000000
CpuBusy = ((LoadAvg - CondorLoadAvg) >= 0.500000)
SlotWeight = Cpus
Unhibernate = MY.MachineLastMatchTime =!= UNDEFINED
MyCurrentTime = 1291974120
Machine = "gpu010.atlas.local"
PublicNetworkIpAddr = "<10.12.0.10:59313>"
CondorVersion = "$CondorVersion: 7.4.2 Mar 30 2010 BuildID: 227044 $"
CondorPlatform = "$CondorPlatform: X86_64-LINUX_DEBIAN50 $"
SlotID = 1
VirtualMachineID = 1
HasFileTransfer = TRUE
HasPerFileEncryption = TRUE
HasReconnect = TRUE
HasMPI = TRUE
HasTDP = TRUE
HasJobDeferral = TRUE
HasJICLocalConfig = TRUE
HasJICLocalStdin = TRUE
JavaVendor = "Free Software Foundation, Inc."
JavaVersion = "1.5.0"
JavaSpecificationVersion = "1.5"
JavaMFlops = 21.875904
HasJava = TRUE
HasRemoteSyscalls = TRUE
HasCheckpointing = TRUE
StarterAbilityList = "HasFileTransfer,HasPerFileEncryption,HasReconnect,HasMPI,HasTDP,HasJobDeferral,HasJICLocalConfig,HasJICLocalStdin,HasJava,HasVM,HasRemoteSyscalls,HasCheckpointing"
ImageSize = 1
ExecutableSize = 1
JobUniverse = 5
NiceUser = FALSE
LastBenchmark = 1291903325
TotalSlots = 4
TotalVirtualMachines = 4
CpuBusyTime = 0
CpuIsBusy = FALSE
TimeToLive = 2147483647
VirtualMemory = 1048239
TotalDisk = 660312760
Disk = 165078192
CondorLoadAvg = 0.000000
LoadAvg = 0.000000
KeyboardIdle = 70903
ConsoleIdle = 70903
Memory = 2960
Cpus = 1
StartdIpAddr = "<10.12.0.10:59313>"
Arch = "X86_64"
OpSys = "LINUX"
UidDomain = "atlas.local"
FileSystemDomain = "atlas.local"
HasIOProxy = TRUE
CheckpointPlatform = "LINUX X86_64 2.6.x normal 0x2aaaaaac7000"
TotalVirtualMemory = 4192956
TotalCpus = 4
TotalMemory = 11843
KFlops = 1276332
Mips = 9486
TotalLoadAvg = 0.030000
TotalCondorLoadAvg = 0.000000
ClockMin = 643
ClockDay = 5
HasVM = FALSE
HibernationLevel = 0
HibernationState = "NONE"
HibernationSupportedStates = ""
CanHibernate = FALSE
HardwareAddress = "00:e0:81:b8:9f:f5"
SubnetMask = "255.0.0.0"
IsWakeOnLanSupported = TRUE
IsWakeOnLanEnabled = TRUE
IsWakeAble = TRUE
WakeOnLanSupportedFlags = "Physical Packet,UniCast Packet,MultiCast Packet,BroadCast Packet,ARP Packet,Magic Packet"
WakeOnLanEnabledFlags = "Magic Packet"
State = "Claimed"
EnteredCurrentState = 1291974222
Activity = "Idle"
EnteredCurrentActivity = 1291974222
TotalTimeOwnerIdle = 67000
TotalTimeMatchedIdle = 1
TotalTimeClaimedIdle = 40
TotalTimeClaimedBusy = 3847
TotalTimePreemptingVacating = 15
Start = (Owner =?= "carsten") && (JobUniverse =?= 1 || target.WantGPU =?= TRUE)
Requirements = (START) && (IsValidCheckpointPlatform)
IsValidCheckpointPlatform = (((TARGET.JobUniverse == 1) == FALSE) || ((MY.CheckpointPlatform =!= UNDEFINED) && ((TARGET.LastCheckpointPlatform =?= MY.CheckpointPlatform) || (TARGET.NumCkpts == 0))))
MaxJobRetirementTime = 0
LastFetchWorkSpawned = 0
LastFetchWorkCompleted = 0
NextFetchWorkDelay = -1
CurrentRank = 10000000.000000
RemoteUser = "carsten@xxxxxxxxxxx"
RemoteOwner = "carsten@xxxxxxxxxxx"
ClientMachine = "atlas4.atlas.local"
MonitorSelfTime = 1291974135
MonitorSelfCPUUsage = 0.096162
MonitorSelfImageSize = 30500.000000
MonitorSelfResidentSetSize = 5980
MonitorSelfAge = 0
MonitorSelfRegisteredSocketCount = 1
12/10 10:43:42 slot1: Job Requirements check failed!
12/10 10:43:42 slot1: Called deactivate_claim_forcibly()
12/10 10:43:42 slot1: State change: received RELEASE_CLAIM command
12/10 10:43:42 slot1: Changing state and activity: Claimed/Idle -> Preempting/Vacating
12/10 10:43:42 slot1: State change: No preempting claim, returning to owner
12/10 10:43:42 slot1: Changing state and activity: Preempting/Vacating -> Owner/Idle