[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Condor-users] Scheduler does not send a job to the matching node PLEASE HELP



Hi,
Thanks in advance

I already have a cluster with one central manager that can run jobs and
another computer that I suppose accepts jobs.

here is the condor_status
root@aolias:/usr/condor-6.8.1/local.aolias/log# condor_status

Name          OpSys       Arch   State      Activity   LoadAv Mem   ActvtyTime

aolias.indra. LINUX       INTEL  Unclaimed  Idle       0.290  1011  0+02:04:16
vm1@amdelcura WINNT51     INTEL  Unclaimed  Idle       0.000  1019  0+00:30:08
vm2@amdelcura WINNT51     INTEL  Unclaimed  Idle       0.010  1019  0+00:30:09

                    Total Owner Claimed Unclaimed Matched Preempting Backfill

        INTEL/LINUX     1     0       0         1       0          0        0
      INTEL/WINNT51     2     0       0         2       0          0        0

              Total     3     0       0         3       0          0        0

I have a Java job that I can submit to the cluster, and it is executed by
the central manager aolias.indra.es (the LINUX machine), but I have
changed the job description in order for it to be executed on the Windows
machine.

So the job description file is:
aolias@aolias:~/Condor_Sample$ cat submit.java.HelloCondor
########################
# Submit description file for hello program
########################
Executable     = HelloWorld.class
Arguments      = HelloWorld
Universe       = java
Output         = hello.java.out
Log            = hello.java.log
Error          = hello.java.err
java_vm_args   = -Xmx64M
should_transfer_files = YES
when_to_transfer_output = ON_EXIT
Requirements   = Memory >= 64 && OpSys == "WINNT51"
Queue


When I submit this job, it is supposed to be executed on either
vm1@amdelcura or vm2@amdelcura, but it always stays IDLE.
There must be some config variable I have to change, but I do not know
what to look for.

Below I am posting more info about the cluster, the job, and the cluster log files.

################################################
cat SchedLog
9/25 19:57:48 (pid:6355) DaemonCore: Command received via UDP from
host <172.22.61.11:33043>
9/25 19:57:48 (pid:6355) DaemonCore: received command 421
(RESCHEDULE), calling handler (reschedule_negotiator)
9/25 19:57:48 (pid:6355) Sent ad to central manager for aolias@xxxxxxxxxxxxxxx
9/25 19:57:48 (pid:6355) Sent ad to 1 collectors for aolias@xxxxxxxxxxxxxxx
9/25 19:57:48 (pid:6355) Called reschedule_negotiator()
9/25 19:57:48 (pid:6355) Activity on stashed negotiator socket
9/25 19:57:48 (pid:6355) Negotiating for owner: aolias@xxxxxxxxxxxxxxx
9/25 19:57:48 (pid:6355) Checking consistency running and runnable jobs
9/25 19:57:48 (pid:6355) Tables are consistent
9/25 19:57:48 (pid:6355) Out of servers - 0 jobs matched, 1 jobs idle,
1 jobs rejected
################################################
cat MatchLog
9/25 19:57:48       Rejected 16.0 aolias@xxxxxxxxxxxxxxx
<172.22.61.11:48545>: no match found

################################################




###############################################
root@aolias:/usr/condor-6.8.1/local.aolias/log# condor_q -l


-- Submitter: aolias.indra.es : <172.22.61.11:48545> : aolias.indra.es
MyType = "Job"
TargetType = "Machine"
ClusterId = 16
QDate = 1159207067
CompletionDate = 0
Owner = "aolias"
RemoteWallClockTime = 0.000000
LocalUserCpu = 0.000000
LocalSysCpu = 0.000000
RemoteUserCpu = 0.000000
RemoteSysCpu = 0.000000
ExitStatus = 0
NumCkpts = 0
NumRestarts = 0
NumSystemHolds = 0
CommittedTime = 0
TotalSuspensions = 0
LastSuspensionTime = 0
CumulativeSuspensionTime = 0
ExitBySignal = FALSE
CondorVersion = "$CondorVersion: 6.8.1 Sep 17 2006  $"
CondorPlatform = "$CondorPlatform: I386-LINUX_RHEL3 $"
RootDir = "/"
Iwd = "/home/aolias/Condor_Sample"
JobUniverse = 10
MinHosts = 1
MaxHosts = 1
CurrentHosts = 0
WantRemoteSyscalls = FALSE
WantCheckpoint = FALSE
JobStatus = 1
EnteredCurrentStatus = 1159207068
JobPrio = 0
User = "aolias@xxxxxxxxxxxxxxx"
NiceUser = FALSE
Environment = ""
JobNotification = 2
WantRemoteIO = TRUE
UserLog = "/home/aolias/Condor_Sample/hello.java.log"
CoreSize = 0
KillSig = "SIGTERM"
Rank = 0.000000
In = "/dev/null"
TransferIn = FALSE
Out = "hello.java.out"
StreamOut = FALSE
Err = "hello.java.err"
StreamErr = FALSE
BufferSize = 524288
BufferBlockSize = 32768
ShouldTransferFiles = "YES"
WhenToTransferOutput = "ON_EXIT"
TransferFiles = "ONEXIT"
Cmd = "java"
TransferExecutable = FALSE
TransferInput = "HelloWorld.class"
ImageSize_RAW = 0
ImageSize = 0
ExecutableSize_RAW = 0
ExecutableSize = 0
DiskUsage_RAW = 1
DiskUsage = 10000
Requirements = (Memory >= 64 && OpSys == "WINNT51") && (HasJava) &&
(Disk >= DiskUsage) && (HasFileTransfer)
JobLeaseDuration = 1200
PeriodicHold = FALSE
PeriodicRelease = FALSE
PeriodicRemove = FALSE
OnExitHold = FALSE
OnExitRemove = TRUE
LeaveJobInQueue = FALSE
Args = "HelloWorld"
JavaVMArgs = "-Xmx64M"
GlobalJobId = "aolias.indra.es#1159207068#16.0"
ProcId = 0
AutoClusterId = 0
AutoClusterAttrs =
"JobUniverse,LastCheckpointPlatform,NumCkpts,DiskUsage,Requirements,NiceUser"
WantMatchDiagnostics = TRUE
LastRejMatchReason = "no match found"
LastRejMatchTime = 1159207068
ServerTime = 1159207082


###############################################
root@aolias:/usr/condor-6.8.1/local.aolias/log# condor_status -l
MyType = "Machine"
TargetType = "Job"
Name = "aolias.indra.es"
Machine = "aolias.indra.es"
Rank = 0.000000
CpuBusy = ((LoadAvg - CondorLoadAvg) >= 0.500000)
COLLECTOR_HOST_STRING = "172.22.61.11"
CondorVersion = "$CondorVersion: 6.8.1 Sep 17 2006  $"
CondorPlatform = "$CondorPlatform: I386-LINUX_RHEL3 $"
VirtualMachineID = 1
VirtualMemory = 1052216
Disk = 6501016
CondorLoadAvg = 0.000000
LoadAvg = 0.280000
KeyboardIdle = 0
ConsoleIdle = 0
Memory = 1011
Cpus = 1
StartdIpAddr = "<172.22.61.11:55265>"
Arch = "INTEL"
OpSys = "LINUX"
UidDomain = "aolias.indra.es"
FileSystemDomain = "aolias.indra.es"
Subnet = "172.22.61"
HasIOProxy = TRUE
CheckpointPlatform = "LINUX INTEL 2.6.x normal"
TotalVirtualMemory = 1052216
TotalDisk = 6501016
TotalCpus = 1
TotalMemory = 1011
KFlops = 377289
Mips = 1116
LastBenchmark = 1159199054
TotalLoadAvg = 0.280000
TotalCondorLoadAvg = 0.000000
ClockMin = 1194
ClockDay = 1
TotalVirtualMachines = 1
HasFileTransfer = TRUE
HasPerFileEncryption = TRUE
HasReconnect = TRUE
HasMPI = TRUE
HasTDP = TRUE
HasJobDeferral = TRUE
HasJICLocalConfig = TRUE
HasJICLocalStdin = TRUE
JavaVendor = "Sun Microsystems Inc."
JavaVersion = "1.5.0_08"
JavaMFlops = 108.137955
HasJava = TRUE
HasPVM = TRUE
HasRemoteSyscalls = TRUE
HasCheckpointing = TRUE
StarterAbilityList =
"HasFileTransfer,HasPerFileEncryption,HasReconnect,HasMPI,HasTDP,HasJobDeferral,HasJICLocalConfig,HasJICLocalStdin,HasJava,HasPVM,HasRemoteSyscalls,HasCheckpointing"
CpuBusyTime = 0
CpuIsBusy = FALSE
TimeToLive = 2147483647
State = "Unclaimed"
EnteredCurrentState = 1159199102
Activity = "Idle"
EnteredCurrentActivity = 1159199102
Start = TRUE
Requirements = (START) && (IsValidCheckpointPlatform)
IsValidCheckpointPlatform = (((TARGET.JobUniverse == 1) == FALSE) ||
((MY.CheckpointPlatform =!= UNDEFINED) &&
((TARGET.LastCheckpointPlatform =?= MY.CheckpointPlatform) ||
(TARGET.NumCkpts == 0))))
MaxJobRetirementTime = 0
CurrentRank = 0.000000
MonitorSelfTime = 1159206734
MonitorSelfCPUUsage = 0.004166
MonitorSelfImageSize = 7976.000000
MonitorSelfResidentSetSize = 3716
MonitorSelfAge = 0
MonitorSelfRegisteredSocketCount = 2
DaemonStartTime = 1159199047
UpdateSequenceNumber = 30
MyAddress = "<172.22.61.11:55265>"
LastHeardFrom = 1159206858
UpdatesTotal = 31
UpdatesSequenced = 30
UpdatesLost = 0
UpdatesHistory = "0x00000000000000000000000000000000"

MyType = "Machine"
TargetType = "Job"
Name = "vm1@xxxxxxxxxxxxxxxxxx"
Machine = "amdelcura.indra.es"
Rank = 0.000000
CpuBusy = ((LoadAvg - CondorLoadAvg) >= 0.500000)
COLLECTOR_HOST_STRING = "172.22.61.11"
CondorVersion = "$CondorVersion: 6.8.1 Sep 18 2006  $"
CondorPlatform = "$CondorPlatform: INTEL-WINNT50 $"
VirtualMachineID = 1
VirtualMemory = 1487430
Disk = 25958696
CondorLoadAvg = 0.000000
LoadAvg = 0.000000
KeyboardIdle = 0
ConsoleIdle = 0
Memory = 1019
Cpus = 1
StartdIpAddr = "<172.22.61.27:4230>"
Arch = "INTEL"
OpSys = "WINNT51"
UidDomain = "indra.es"
FileSystemDomain = "amdelcura.indra.es"
Subnet = "172.22.61"
HasIOProxy = TRUE
CheckpointPlatform = "WINNT51 INTEL Unknown normal"
TotalVirtualMemory = 2974860
TotalDisk = 51917392
TotalCpus = 2
TotalMemory = 2038
KFlops = 686323
Mips = 2277
LastBenchmark = 1159204790
TotalLoadAvg = 0.010000
TotalCondorLoadAvg = 0.000000
ClockMin = 1189
ClockDay = 1
TotalVirtualMachines = 2
HasFileTransfer = TRUE
HasPerFileEncryption = TRUE
HasReconnect = TRUE
HasMPI = TRUE
HasTDP = TRUE
HasJobDeferral = TRUE
HasJICLocalConfig = TRUE
HasJICLocalStdin = TRUE
HasWindowsRunAsOwner = TRUE
StarterAbilityList =
"HasFileTransfer,HasPerFileEncryption,HasReconnect,HasMPI,HasTDP,HasJobDeferral,HasJICLocalConfig,HasJICLocalStdin,HasWindowsRunAsOwner"
CpuBusyTime = 0
CpuIsBusy = FALSE
TimeToLive = 2147483647
State = "Unclaimed"
EnteredCurrentState = 1159204790
Activity = "Idle"
EnteredCurrentActivity = 1159204790
Start = TRUE
Requirements = (START) && (IsValidCheckpointPlatform)
IsValidCheckpointPlatform = (((TARGET.JobUniverse == 1) == FALSE) ||
((MY.CheckpointPlatform =!= UNDEFINED) &&
((TARGET.LastCheckpointPlatform =?= MY.CheckpointPlatform) ||
(TARGET.NumCkpts == 0))))
MaxJobRetirementTime = 0
CurrentRank = 0.000000
MonitorSelfTime = 1159206470
MonitorSelfCPUUsage = 0.013020
MonitorSelfImageSize = 50272.000000
MonitorSelfResidentSetSize = 9632
MonitorSelfAge = 1690
MonitorSelfRegisteredSocketCount = 2
DaemonStartTime = 1159204780
UpdateSequenceNumber = 6
MyAddress = "<172.22.61.27:4230>"
LastHeardFrom = 1159206598
UpdatesTotal = 25
UpdatesSequenced = 22
UpdatesLost = 1
UpdatesHistory = "0x00400000000000000000000000000000"

MyType = "Machine"
TargetType = "Job"
Name = "vm2@xxxxxxxxxxxxxxxxxx"
Machine = "amdelcura.indra.es"
Rank = 0.000000
CpuBusy = ((LoadAvg - CondorLoadAvg) >= 0.500000)
COLLECTOR_HOST_STRING = "172.22.61.11"
CondorVersion = "$CondorVersion: 6.8.1 Sep 18 2006  $"
CondorPlatform = "$CondorPlatform: INTEL-WINNT50 $"
VirtualMachineID = 2
VirtualMemory = 1487430
Disk = 25958696
CondorLoadAvg = 0.000000
LoadAvg = 0.000000
KeyboardIdle = 0
ConsoleIdle = 0
Memory = 1019
Cpus = 1
StartdIpAddr = "<172.22.61.27:4230>"
Arch = "INTEL"
OpSys = "WINNT51"
UidDomain = "indra.es"
FileSystemDomain = "amdelcura.indra.es"
Subnet = "172.22.61"
HasIOProxy = TRUE
CheckpointPlatform = "WINNT51 INTEL Unknown normal"
TotalVirtualMemory = 2974860
TotalDisk = 51917392
TotalCpus = 2
TotalMemory = 2038
KFlops = 686323
Mips = 2277
LastBenchmark = 1159204790
TotalLoadAvg = 0.000000
TotalCondorLoadAvg = 0.000000
ClockMin = 1194
ClockDay = 1
TotalVirtualMachines = 2
HasFileTransfer = TRUE
HasPerFileEncryption = TRUE
HasReconnect = TRUE
HasMPI = TRUE
HasTDP = TRUE
HasJobDeferral = TRUE
HasJICLocalConfig = TRUE
HasJICLocalStdin = TRUE
HasWindowsRunAsOwner = TRUE
StarterAbilityList =
"HasFileTransfer,HasPerFileEncryption,HasReconnect,HasMPI,HasTDP,HasJobDeferral,HasJICLocalConfig,HasJICLocalStdin,HasWindowsRunAsOwner"
CpuBusyTime = 0
CpuIsBusy = FALSE
TimeToLive = 2147483647
State = "Unclaimed"
EnteredCurrentState = 1159204790
Activity = "Idle"
EnteredCurrentActivity = 1159204790
Start = TRUE
Requirements = (START) && (IsValidCheckpointPlatform)
IsValidCheckpointPlatform = (((TARGET.JobUniverse == 1) == FALSE) ||
((MY.CheckpointPlatform =!= UNDEFINED) &&
((TARGET.LastCheckpointPlatform =?= MY.CheckpointPlatform) ||
(TARGET.NumCkpts == 0))))
MaxJobRetirementTime = 0
CurrentRank = 0.000000
MonitorSelfTime = 1159206710
MonitorSelfCPUUsage = 0.006510
MonitorSelfImageSize = 50272.000000
MonitorSelfResidentSetSize = 9632
MonitorSelfAge = 1930
MonitorSelfRegisteredSocketCount = 2
DaemonStartTime = 1159204780
UpdateSequenceNumber = 7
MyAddress = "<172.22.61.27:4230>"
LastHeardFrom = 1159206899
UpdatesTotal = 26
UpdatesSequenced = 23
UpdatesLost = 2
UpdatesHistory = "0x00010080000000000000000000000000"