[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Condor-users] Why does job submitted by virtual machine(running Xen 3.0) not run?



Hi,
I have a Condor pool running under Linux. There is one virtual machine (running Xen 3.0) and several real machines in the pool. All the jobs submitted from the real machines run quite well in the pool. But the jobs submitted from the virtual machine (running Xen 3.0) always stay 'idle', even when all the machines in the pool are in the 'unclaimed' state. Can anyone tell me why, and how to force the jobs to run?
Thank you in advance for your help.
 
The operating system of the virtual machine is CentOS 4.2, and the operating system of all the real machines is
CentOS 4.3. I am using Condor 6.7.19 for Fedora Core 4 in the Condor pool.
 
The state of the Condor pool is as follows (gcnode034.cap is the virtual machine; gcnode022.cap, gcnode026.cap and gcnode038.cap are real machines):
 
Name                   OpSys         Arch      State          Activity   LoadAv Mem   ActvtyTime
 
vm1@gcnode022 LINUX       INTEL  Unclaimed  Idle       0.000  1014  0+01:32:10
vm2@gcnode022 LINUX       INTEL  Unclaimed  Idle       0.000  1014  0+03:42:42
vm3@gcnode022 LINUX       INTEL  Unclaimed  Idle       0.000  1014  0+03:43:46
vm4@gcnode022 LINUX       INTEL  Unclaimed  Idle       0.000  1014  0+03:43:42
vm1@gcnode026 LINUX       INTEL  Unclaimed  Idle       0.000   503  0+03:18:47
vm2@gcnode026 LINUX       INTEL  Unclaimed  Idle       0.000   503  0+01:05:05
gcnode034.cap     LINUX       INTEL  Unclaimed  Idle       0.000   800  0+03:21:40
gcnode038.cap     LINUX       INTEL  Unclaimed  Idle       0.000   800  0+02:56:40
 
                     Total Owner Claimed Unclaimed Matched Preempting Backfill
 
         INTEL/LINUX     8     0       0         8       0          0        0
 
                          Total     8     0       0         8       0          0        0

 
I have edited the local config file (condor_config) of every machine in the pool and added the following lines:
 
START        = True
SUSPEND    = False
CONTINUE  = True
PREEMPT    = False
KILL          = False
 
But the jobs still stay 'idle'.
 
When I run the condor_q -l command, I see the following:
 
-- Submitter: gcnode034.cap : <192.168.10.34:47204> : gcnode034.cap
MyType = "Job"
TargetType = "Machine"
ClusterId = 12
QDate = 1149147167
CompletionDate = 0
Owner = "condor"
RemoteWallClockTime = 0.000000
LocalUserCpu = 0.000000
LocalSysCpu = 0.000000
RemoteUserCpu = 0.000000
RemoteSysCpu = 0.000000
ExitStatus = 0
NumCkpts = 0
NumRestarts = 0
NumSystemHolds = 0
CommittedTime = 0
TotalSuspensions = 0
LastSuspensionTime = 0
CumulativeSuspensionTime = 0
ExitBySignal = FALSE
CondorVersion = "$CondorVersion: 6.7.19 May 10 2006 $"
CondorPlatform = "$CondorPlatform: I386-LINUX_RH9 $"
RootDir = "/"
Iwd = "/home/condor"
JobUniverse = 1
Cmd = "/home/condor/fortIO.remote"
MinHosts = 1
MaxHosts = 1
CurrentHosts = 0
WantRemoteSyscalls = TRUE
WantCheckpoint = TRUE
JobStatus = 1
EnteredCurrentStatus = 1149147167
JobPrio = 0
User = "condor@xxxxxxxxxxxxx"
NiceUser = FALSE
MaxJobRetirementTime = 0
Environment = ""
JobNotification = 2
WantRemoteIO = TRUE
UserLog = "/home/condor/fortIO.log"
CoreSize = 0
KillSig = "SIGTSTP"
Rank = 0.000000
In = "/dev/null"
TransferIn = FALSE
Out = "fortIO.out"
StreamOut = FALSE
Err = "fortIO.err"
StreamErr = FALSE
BufferSize = 524288
BufferBlockSize = 32768
ShouldTransferFiles = "NO"
TransferFiles = "NEVER"
ImageSize_RAW = 13356
ImageSize = 20000
ExecutableSize_RAW = 13356
ExecutableSize = 20000
DiskUsage_RAW = 13356
DiskUsage = 20000
Requirements = (Arch == "INTEL") && (OpSys == "LINUX") && ((CkptArch == Arch) || (CkptArch =?= UNDEFINED)) && ((CkptOpSys == OpSys) || (CkptOpSys =?= UNDEFINED)) && (Disk >= DiskUsage) && ((Memory * 1024) >= ImageSize)
FileSystemDomain = "gcnode034.cap"
PeriodicHold = FALSE
PeriodicRelease = FALSE
PeriodicRemove = FALSE
LeaveJobInQueue = FALSE
Arguments = ""
GlobalJobId = "gcnode034.cap#1149147167#12.0"
ProcId = 0
AutoClusterId = 0
AutoClusterAttrs = "JobUniverse,LastCheckpointPlatform,NumCkpts,DiskUsage,ImageSize,Requirements"
ServerTime = 1149159854
 
MyType = "Job"
TargetType = "Machine"
ClusterId = 13
QDate = 1149147167
CompletionDate = 0
Owner = "condor"
RemoteWallClockTime = 0.000000
LocalUserCpu = 0.000000
LocalSysCpu = 0.000000
RemoteUserCpu = 0.000000
RemoteSysCpu = 0.000000
ExitStatus = 0
NumCkpts = 0
NumRestarts = 0
NumSystemHolds = 0
CommittedTime = 0
TotalSuspensions = 0
LastSuspensionTime = 0
CumulativeSuspensionTime = 0
ExitBySignal = FALSE
CondorVersion = "$CondorVersion: 6.7.19 May 10 2006 $"
CondorPlatform = "$CondorPlatform: I386-LINUX_RH9 $"
RootDir = "/"
Iwd = "/home/condor"
JobUniverse = 5
Cmd = "/home/condor/sh_loop"
MinHosts = 1
MaxHosts = 1
CurrentHosts = 0
WantRemoteSyscalls = FALSE
WantCheckpoint = FALSE
JobStatus = 1
EnteredCurrentStatus = 1149147167
JobPrio = 0
User = "condor@xxxxxxxxxxxxx"
NiceUser = FALSE
Environment = ""
JobNotification = 2
WantRemoteIO = TRUE
UserLog = "/home/condor/sh_loop.log"
CoreSize = 0
KillSig = "SIGTERM"
Rank = 0.000000
In = "/dev/null"
TransferIn = FALSE
Out = "sh_loop.out"
StreamOut = FALSE
Err = "sh_loop.err"
StreamErr = FALSE
BufferSize = 524288
BufferBlockSize = 32768
ShouldTransferFiles = "IF_NEEDED"
WhenToTransferOutput = "ON_EXIT"
TransferFiles = "ONEXIT"
ImageSize_RAW = 1
ImageSize = 10000
ExecutableSize_RAW = 1
ExecutableSize = 10000
DiskUsage_RAW = 1
DiskUsage = 10000
Requirements = (Arch == "INTEL") && (OpSys == "LINUX") && (Disk >= DiskUsage) && ((Memory * 1024) >= ImageSize) && ((HasFileTransfer) || (TARGET.FileSystemDomain == MY.FileSystemDomain))
FileSystemDomain = "gcnode034.cap"
JobLeaseDuration = 1200
PeriodicHold = FALSE
PeriodicRelease = FALSE
PeriodicRemove = FALSE
LeaveJobInQueue = FALSE
Args = "60"
GlobalJobId = "gcnode034.cap#1149147167#13.0"
ProcId = 0
AutoClusterId = 1
AutoClusterAttrs = "JobUniverse,LastCheckpointPlatform,NumCkpts,DiskUsage,ImageSize,FileSystemDomain,Requirements"
ServerTime = 1149159854
 
Best wishes!
_________
Yufang Zhang
2006-06-01