[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Condor-users] Example jobs don't start



Hi all

I just set up a Condor pool with 4 machines:
io = central manager
vrubel, goya (dual-CPU): execute machines
chagall : submit machine

I ran the example jobs: only No. 1 and 3 ran; the others just don't start, and I don't know why (well, the machines rejected the jobs "because of their own requirements", but which requirements?)

- I don't really understand why No. 1 and 3 ran on "vrubel", but nothing started on "goya".

- Is there a command to find out, for each machine, which criteria the job did not satisfy?

Thanks for your help (and sorry for the long mail, but the info should be here...)

_______________________________________
guiot@chagall$ condor_status

Name          OpSys       Arch   State      Activity   LoadAv Mem   ActvtyTime

vm1@xxxxxxxxx LINUX       INTEL  Owner      Idle       0.000   188  1+19:50:21
vm2@xxxxxxxxx LINUX       INTEL  Owner      Idle       0.000   188  1+19:50:22
vrubel.galaxy LINUX       INTEL  Claimed    Busy       0.000   250  0+00:11:46

                     Machines Owner Claimed Unclaimed Matched Preempting

         INTEL/LINUX        3     2       1         0       0          0

               Total        3     2       1         0       0          0
______________________________________

$ condor_q -analyze


-- Submitter: chagall.galaxy.ibpc.fr : <193.49.27.24:48041> : chagall.galaxy.ibpc.fr
 ID      OWNER            SUBMITTED     RUN_TIME ST PRI SIZE CMD
---
002.000:  Run analysis summary.  Of 3 machines,
      0 are rejected by your job's requirements
      2 reject your job because of their own requirements
      1 match but are serving users with a better priority in the pool
      0 match but reject the job for unknown reasons
      0 match but will not currently preempt their existing job
      0 are available to run your job
        Last successful match: Wed Sep 21 04:37:51 2005
        Last failed match: Wed Sep 21 11:23:29 2005
        Reason for last match failure: no match found
---
004.000:  Request is being serviced

---
004.001:  Run analysis summary.  Of 3 machines,
      0 are rejected by your job's requirements
      2 reject your job because of their own requirements
      1 match but are serving users with a better priority in the pool
      0 match but reject the job for unknown reasons
      0 match but will not currently preempt their existing job
      0 are available to run your job
        No successful match recorded.
        Last failed match: Wed Sep 21 11:23:29 2005
        Reason for last match failure: no match found
---
004.002:  Run analysis summary.  Of 3 machines,
      0 are rejected by your job's requirements
      2 reject your job because of their own requirements
      1 match but are serving users with a better priority in the pool
      0 match but reject the job for unknown reasons
      0 match but will not currently preempt their existing job
      0 are available to run your job
....
______________________________________
guiot@chagall$ condor_status -l
MyType = "Machine"
TargetType = "Job"
Name = "vm1@xxxxxxxxxxxxxxxxxxx"
Machine = "goya.galaxy.ibpc.fr"
Rank = 0.000000
CpuBusy = ((LoadAvg - CondorLoadAvg) >= 0.500000)
COLLECTOR_HOST_STRING = "io.galaxy.ibpc.fr"
CondorVersion = "$CondorVersion: 6.7.10 Aug  3 2005 $"
CondorPlatform = "$CondorPlatform: I386-LINUX_RH9 $"
VirtualMachineID = 1
VirtualMemory = 1048568
Disk = 2451806
CondorLoadAvg = 0.000000
LoadAvg = 0.000000
KeyboardIdle = 0
ConsoleIdle = 0
Memory = 188
Cpus = 1
StartdIpAddr = "<193.49.27.81:32772>"
Arch = "INTEL"
OpSys = "LINUX"
UidDomain = "galaxy.ibpc.fr"
FileSystemDomain = "galaxy.ibpc.fr"
Subnet = "193.49.27"
HasIOProxy = TRUE
TotalVirtualMemory = 2097136
TotalDisk = 4903612
TotalCpus = 2
TotalMemory = 376
KFlops = 138441
Mips = 504
LastBenchmark = 1127136637
TotalLoadAvg = 0.000000
TotalCondorLoadAvg = 0.000000
ClockMin = 680
ClockDay = 3
TotalVirtualMachines = 2
HasFileTransfer = TRUE
HasPerFileEncryption = TRUE
HasReconnect = TRUE
HasMPI = TRUE
HasTDP = TRUE
HasJICLocalConfig = TRUE
HasJICLocalStdin = TRUE
JavaVendor = "Sun Microsystems Inc."
JavaVersion = "1.4.2_05"
JavaMFlops = 49.363747
HasJava = TRUE
HasPVM = TRUE
HasRemoteSyscalls = TRUE
HasCheckpointing = TRUE
StarterAbilityList = "HasFileTransfer,HasPerFileEncryption,HasReconnect,HasMPI,HasTDP,HasJICLocalConfig,HasJICLocalStdin,HasJava,HasPVM,HasRemoteSyscalls,HasCheckpointing"
CpuBusyTime = 0
CpuIsBusy = FALSE
TimeToLive = 2147483647
State = "Owner"
EnteredCurrentState = 1127136624
Activity = "Idle"
EnteredCurrentActivity = 1127136624
Start = ((KeyboardIdle > 15 * 60) && (((LoadAvg - CondorLoadAvg) <= 0.300000) || (State != "Unclaimed" && State != "Owner")))
Requirements = START
MaxJobRetirementTime = 0
CurrentRank = 0.000000
MonitorSelfTime = 1127294317
MonitorSelfCPUUsage = 0.016666
MonitorSelfImageSize = 7060.000000
MonitorSelfResidentSetSize = 3380
MonitorSelfAge = 157710
DaemonStartTime = 1127136623
UpdateSequenceNumber = 526
MyAddress = "<193.49.27.81:32772>"
LastHeardFrom = 1127294445
UpdatesTotal = 521
UpdatesSequenced = 520
UpdatesLost = 2
UpdatesHistory = "0x00000000000000000000000000000000"

MyType = "Machine"
TargetType = "Job"
Name = "vm2@xxxxxxxxxxxxxxxxxxx"
Machine = "goya.galaxy.ibpc.fr"
Rank = 0.000000
CpuBusy = ((LoadAvg - CondorLoadAvg) >= 0.500000)
COLLECTOR_HOST_STRING = "io.galaxy.ibpc.fr"
CondorVersion = "$CondorVersion: 6.7.10 Aug  3 2005 $"
CondorPlatform = "$CondorPlatform: I386-LINUX_RH9 $"
VirtualMachineID = 2
VirtualMemory = 1048568
Disk = 2451806
CondorLoadAvg = 0.000000
LoadAvg = 0.000000
KeyboardIdle = 0
ConsoleIdle = 0
Memory = 188
Cpus = 1
StartdIpAddr = "<193.49.27.81:32772>"
Arch = "INTEL"
OpSys = "LINUX"
UidDomain = "galaxy.ibpc.fr"
FileSystemDomain = "galaxy.ibpc.fr"
Subnet = "193.49.27"
HasIOProxy = TRUE
TotalVirtualMemory = 2097136
TotalDisk = 4903612
TotalCpus = 2
TotalMemory = 376
KFlops = 138441
Mips = 504
LastBenchmark = 1127136637
TotalLoadAvg = 0.000000
TotalCondorLoadAvg = 0.000000
ClockMin = 680
ClockDay = 3
TotalVirtualMachines = 2
HasFileTransfer = TRUE
HasPerFileEncryption = TRUE
HasReconnect = TRUE
HasMPI = TRUE
HasTDP = TRUE
HasJICLocalConfig = TRUE
HasJICLocalStdin = TRUE
JavaVendor = "Sun Microsystems Inc."
JavaVersion = "1.4.2_05"
JavaMFlops = 49.363747
HasJava = TRUE
HasPVM = TRUE
HasRemoteSyscalls = TRUE
HasCheckpointing = TRUE
StarterAbilityList = "HasFileTransfer,HasPerFileEncryption,HasReconnect,HasMPI,HasTDP,HasJICLocalConfig,HasJICLocalStdin,HasJava,HasPVM,HasRemoteSyscalls,HasCheckpointing"
CpuBusyTime = 0
CpuIsBusy = FALSE
TimeToLive = 2147483647
State = "Owner"
EnteredCurrentState = 1127136624
Activity = "Idle"
EnteredCurrentActivity = 1127136624
Start = ((KeyboardIdle > 15 * 60) && (((LoadAvg - CondorLoadAvg) <= 0.300000) || (State != "Unclaimed" && State != "Owner")))
Requirements = START
MaxJobRetirementTime = 0
CurrentRank = 0.000000
MonitorSelfTime = 1127294317
MonitorSelfCPUUsage = 0.016666
MonitorSelfImageSize = 7060.000000
MonitorSelfResidentSetSize = 3380
MonitorSelfAge = 157710
DaemonStartTime = 1127136623
UpdateSequenceNumber = 526
MyAddress = "<193.49.27.81:32772>"
LastHeardFrom = 1127294446
UpdatesTotal = 522
UpdatesSequenced = 521
UpdatesLost = 0
UpdatesHistory = "0x00000000000000000000000000000000"

MyType = "Machine"
TargetType = "Job"
Name = "vrubel.galaxy.ibpc.fr"
Machine = "vrubel.galaxy.ibpc.fr"
Rank = 0.000000
CpuBusy = ((LoadAvg - CondorLoadAvg) >= 0.500000)
COLLECTOR_HOST_STRING = "io.galaxy.ibpc.fr"
CondorVersion = "$CondorVersion: 6.7.10 Aug  3 2005 $"
CondorPlatform = "$CondorPlatform: I386-LINUX_RH9 $"
VirtualMachineID = 1
ExecutableSize = 12161
JobUniverse = 1
NiceUser = FALSE
ImageSize = 7208
VirtualMemory = 1051900
Disk = 4992980
CondorLoadAvg = 0.000000
LoadAvg = 0.000000
KeyboardIdle = 3116
ConsoleIdle = 155033
Memory = 250
Cpus = 1
StartdIpAddr = "<193.49.27.11:32772>"
Arch = "INTEL"
OpSys = "LINUX"
UidDomain = "galaxy.ibpc.fr"
FileSystemDomain = "galaxy.ibpc.fr"
Subnet = "193.49.27"
HasIOProxy = TRUE
TotalVirtualMemory = 1051900
TotalDisk = 4992980
TotalCpus = 1
TotalMemory = 250
KFlops = 97422
Mips = 496
LastBenchmark = 1127270034
TotalLoadAvg = 0.000000
TotalCondorLoadAvg = 0.000000
ClockMin = 680
ClockDay = 3
TotalVirtualMachines = 1
HasFileTransfer = TRUE
HasPerFileEncryption = TRUE
HasReconnect = TRUE
HasMPI = TRUE
HasTDP = TRUE
HasJICLocalConfig = TRUE
HasJICLocalStdin = TRUE
JavaVendor = "Sun Microsystems Inc."
JavaVersion = "1.4.2_05"
JavaMFlops = 47.434353
HasJava = TRUE
HasPVM = TRUE
HasRemoteSyscalls = TRUE
HasCheckpointing = TRUE
StarterAbilityList = "HasFileTransfer,HasPerFileEncryption,HasReconnect,HasMPI,HasTDP,HasJICLocalConfig,HasJICLocalStdin,HasJava,HasPVM,HasRemoteSyscalls,HasCheckpointing"
CpuBusyTime = 0
CpuIsBusy = FALSE
TimeToLive = 2147483647
State = "Claimed"
EnteredCurrentState = 1127278383
Activity = "Busy"
EnteredCurrentActivity = 1127293754
Start = ((KeyboardIdle > 15 * 60) && (((LoadAvg - CondorLoadAvg) <= 0.300000) || (State != "Unclaimed" && State != "Owner")))
Requirements = START
MaxJobRetirementTime = 0
CurrentRank = 0.000000
RemoteUser = "condor@xxxxxxxxxxxxxx"
RemoteOwner = "condor@xxxxxxxxxxxxxx"
ClientMachine = "chagall.galaxy.ibpc.fr"
JobId = "4.0"
GlobalJobId = "chagall.galaxy.ibpc.fr#1127137022#4.0"
JobStart = 1127293754
LastPeriodicCheckpoint = 1127293754
TotalJobRunTime = 706
TotalClaimRunTime = 15761
TotalClaimSuspendTime = 306
MonitorSelfTime = 1127294388
MonitorSelfCPUUsage = 0.266182
MonitorSelfImageSize = 7212.000000
MonitorSelfResidentSetSize = 3652
MonitorSelfAge = 157457
DaemonStartTime = 1127136944
UpdateSequenceNumber = 590
MyAddress = "<193.49.27.11:32772>"
LastHeardFrom = 1127294460
UpdatesTotal = 586
UpdatesSequenced = 585
UpdatesLost = 3
UpdatesHistory = "0x00000000000000000000000000000010"
________________________________________
guiot@chagall:/ibpc/io/condor/condor-6.7.10/examples$ condor_q


-- Submitter: chagall.galaxy.ibpc.fr : <193.49.27.24:48041> : chagall.galaxy.ibpc.fr
 ID      OWNER            SUBMITTED     RUN_TIME ST PRI SIZE CMD
   2.0   condor          9/19 15:36   1+07:16:33 I  0   11.8 env.remote foo bar
   4.0   condor          9/19 15:37   0+12:59:54 R  0   11.9 loop.remote 200
   4.1   condor          9/19 15:37   0+00:00:00 I  0   11.9 loop.remote 200
   4.2   condor          9/19 15:37   0+00:00:00 I  0   11.9 loop.remote 300
   4.3   condor          9/19 15:37   0+00:00:00 I  0   11.9 loop.remote 300
   4.4   condor          9/19 15:37   0+00:00:00 I  0   11.9 loop.remote 500
   5.0   condor          9/19 15:37   0+00:00:00 I  0   11.9 registers.remote
   6.0   condor          9/19 15:37   0+00:00:00 I  0   12.1 reader.remote
   7.0   condor          9/19 15:37   0+00:00:00 I  0   12.1 printer.remote
   8.0   condor          9/19 15:38   0+00:00:00 I  0   12.1 fortIO.remote
   9.0   condor          9/19 15:38   0+00:00:00 I  0   0.0  sh_loop 60

11 jobs; 10 idle, 1 running, 0 held
guiot@chagall$ condor_q -l


-- Submitter: chagall.galaxy.ibpc.fr : <193.49.27.24:48041> : chagall.galaxy.ibpc.fr
MyType = "Job"
TargetType = "Machine"
ClusterId = 2
QDate = 1127136987
CompletionDate = 0
Owner = "condor"
LocalUserCpu = 0.000000
LocalSysCpu = 0.000000
RemoteUserCpu = 0.000000
RemoteSysCpu = 0.000000
ExitStatus = 0
NumCkpts = 0
NumRestarts = 0
NumSystemHolds = 0
CommittedTime = 0
TotalSuspensions = 0
CumulativeSuspensionTime = 0
ExitBySignal = FALSE
CondorVersion = "$CondorVersion: 6.7.10 Aug  3 2005 $"
CondorPlatform = "$CondorPlatform: I386-LINUX_RH9 $"
RootDir = "/"
Iwd = "/ibpc/io/condor/condor-6.7.10/examples"
JobUniverse = 1
Cmd = "/ibpc/io/condor/condor-6.7.10/examples/env.remote"
MinHosts = 1
WantRemoteSyscalls = TRUE
WantCheckpoint = TRUE
RemoteSpoolDir = "/scratch/condor/spool/cluster2.proc0.subproc0"
JobPrio = 0
User = "condor@xxxxxxxxxxxxxx"
NiceUser = FALSE
MaxJobRetirementTime = 0
Env = "alpha=a;bravo=b;charlie=c"
JobNotification = 2
WantRemoteIO = TRUE
UserLog = "/ibpc/io/condor/condor-6.7.10/examples/env.log"
CoreSize = 0
KillSig = "SIGTSTP"
Rank = 0.000000
In = "/dev/null"
TransferIn = FALSE
Out = "env.out"
StreamOut = FALSE
Err = "env.err"
StreamErr = FALSE
BufferSize = 524288
BufferBlockSize = 32768
ShouldTransferFiles = "NO"
TransferFiles = "NEVER"
ImageSize = 12131
ExecutableSize = 12131
DiskUsage = 12131
Requirements = (Arch == "INTEL") && (OpSys == "LINUX") && ((CkptArch == Arch) || (CkptArch =?= UNDEFINED)) && ((CkptOpSys == OpSys) || (CkptOpSys =?= UNDEFINED)) && (Disk >= DiskUsage) && ((Memory * 1024) >= ImageSize)
FileSystemDomain = "galaxy.ibpc.fr"
PeriodicHold = FALSE
PeriodicRelease = FALSE
PeriodicRemove = FALSE
OnExitHold = FALSE
OnExitRemove = TRUE
LeaveJobInQueue = FALSE
Args = "foo bar glarch"
GlobalJobId = "chagall.galaxy.ibpc.fr#1127136987#2.0"
ProcId = 0
JobStartDate = 1127138695
LastMatchTime = 1127270271
NumJobMatches = 4
OrigMaxHosts = 1
JobLastStartDate = 1127270275
JobCurrentStartDate = 1127275393
JobRunCount = 22
LastJobLeaseRenewal = 1127277906
RemoteWallClockTime = 112593.000000
LastRemoteHost = "vrubel.galaxy.ibpc.fr"
LastClaimId = "<193.49.27.11:32772>#1127136932#13"
CurrentHosts = 0
JobStatus = 1
EnteredCurrentStatus = 1127280509
LastSuspensionTime = 0
MaxHosts = 1
WantMatchDiagnostics = TRUE
LastRejMatchReason = "no match found"
LastRejMatchTime = 1127294309
ServerTime = 1127294486







-----------------------------------------------
CNRS - UPR 9080 : Laboratoire de Biochimie Theorique
Institut de Biologie Physico Chimique
13 rue Pierre et Marie Curie
75005 PARIS - FRANCE

Tel : +33 158 41 51 70
Fax : +33 158 41 50 26
------------------------------------------------