[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Condor-users] Newbie - Java VM bombing without enough memory



Hi all, and thanks in advance

I've been struggling to get Condor running under Gentoo. I've got the init
scripts working, and ps reports all the necessary processes. At the moment
I'm running a test manager/submit/execute pool on a single machine, with
no success.

When I try to submit my test Java / bash jobs, they are evicted from the
queue.

In Java I get "Could not create java virtual machine", and in bash "not
enough memory to create virtual machine". I've tried the recommendations
I could find in the documentation and on the mailing list so far, but
I've run out of ideas.

I've attached the status, and below is the shadow failure message from
the logs:

5/23 15:50:43 ******************************************************
5/23 15:50:43 ** condor_shadow (CONDOR_SHADOW) STARTING UP
5/23 15:50:43 ** /opt/condor/sbin/condor_shadow
5/23 15:50:43 ** $CondorVersion: 6.8.4 Feb  1 2007 $
5/23 15:50:43 ** $CondorPlatform: I386-LINUX_RHEL3 $
5/23 15:50:43 ** PID = 18535
5/23 15:50:43 ** Log last touched 5/23 15:45:44
5/23 15:50:43 ******************************************************
5/23 15:50:43 Using config source: /opt/condor/etc/condor_config
5/23 15:50:43 Using local config sources:
5/23 15:50:43    /opt/condor/hosts/slatemine/condor_config.local
5/23 15:50:43 DaemonCore: Command Socket at <10.20.9.240:43768>
5/23 15:50:43 Initializing a JAVA shadow for job 8.0
5/23 15:50:43 (8.0) (18535): Request to run on <10.20.9.240:52859> was
ACCEPTED
5/23 15:50:44 (8.0) (18535): Job 8.0 is being evicted
5/23 15:50:44 (8.0) (18535): **** condor_shadow (condor_SHADOW) EXITING
WITH STATUS 107


Any ideas?




This message should be regarded as confidential. If you have received this email in error please notify the sender and destroy it immediately.
Statements of intent shall only become binding when confirmed in hard copy by an authorised signatory.  The contents of this email may relate to dealings with other companies within the Detica Group plc group of companies.

Detica Limited is registered in England under No: 1337451.

Registered offices: Surrey Research Park, Guildford, Surrey, GU2 7YP, England.


MyType = "Machine"
TargetType = "Job"
Name = "vm1@xxxxxxxxxxxxxxxxx"
Machine = "aBox.mydomain.com"
Rank = Scheduler =?= "DedicatedScheduler@xxxxxxxxxxxxxxxxx"
CpuBusy = ((LoadAvg - CondorLoadAvg) >= 0.500000)
COLLECTOR_HOST_STRING = "aBox.mydomain.com"
DedicatedScheduler = "DedicatedScheduler@xxxxxxxxxxxxxxxxx"
CondorVersion = "$CondorVersion: 6.8.4 Feb  1 2007 $"
CondorPlatform = "$CondorPlatform: I386-LINUX_RHEL3 $"
VirtualMachineID = 1
VirtualMemory = 1047884
Disk = 10729620
CondorLoadAvg = 0.000000
LoadAvg = 0.120000
KeyboardIdle = 0
ConsoleIdle = 0
Memory = 1008
Cpus = 1
StartdIpAddr = "<10.20.9.240:52859>"
Arch = "INTEL"
OpSys = "LINUX"
UidDomain = "aBox.mydomain.com"
FileSystemDomain = "aBox.mydomain.com"
Subnet = "10.20.9"
HasIOProxy = TRUE
CheckpointPlatform = "LINUX INTEL 2.6.x normal"
TotalVirtualMemory = 2095768
TotalDisk = 21459240
TotalCpus = 2
TotalMemory = 2017
KFlops = 775895
Mips = 2371
LastBenchmark = 1179931251
TotalLoadAvg = 0.120000
TotalCondorLoadAvg = 0.000000
ClockMin = 950
ClockDay = 3
TotalVirtualMachines = 2
HasFileTransfer = TRUE
HasPerFileEncryption = TRUE
HasReconnect = TRUE
HasMPI = TRUE
HasTDP = TRUE
HasJobDeferral = TRUE
HasJICLocalConfig = TRUE
HasJICLocalStdin = TRUE
JavaVendor = "Sun Microsystems Inc."
JavaVersion = "1.5.0_10"
JavaMFlops = 189.330658
HasJava = TRUE
HasPVM = TRUE
HasRemoteSyscalls = TRUE
HasCheckpointing = TRUE
StarterAbilityList = "HasFileTransfer,HasPerFileEncryption,HasReconnect,HasMPI,HasTDP,HasJobDeferral,HasJICLocalConfig,HasJICLocalStdin,HasJava,HasPVM,HasRemoteSyscalls,HasCheckpointing"
CpuBusyTime = 0
CpuIsBusy = FALSE
TimeToLive = 2147483647
State = "Unclaimed"
EnteredCurrentState = 1179931844
Activity = "Idle"
EnteredCurrentActivity = 1179931844
Start = TRUE
Requirements = (START) && (IsValidCheckpointPlatform)
IsValidCheckpointPlatform = (((TARGET.JobUniverse == 1) == FALSE) || ((MY.CheckpointPlatform =!= UNDEFINED) && ((TARGET.LastCheckpointPlatform =?= MY.CheckpointPlatform) || (TARGET.NumCkpts == 0))))
MaxJobRetirementTime = 0
CurrentRank = 0.000000
MonitorSelfTime = 1179931731
MonitorSelfCPUUsage = 0.000000
MonitorSelfImageSize = 8152.000000
MonitorSelfResidentSetSize = 3776
MonitorSelfAge = 0
MonitorSelfRegisteredSocketCount = 2
DaemonStartTime = 1179931245
UpdateSequenceNumber = 5
MyAddress = "<10.20.9.240:52859>"
LastHeardFrom = 1179931855
UpdatesTotal = 6
UpdatesSequenced = 5
UpdatesLost = 0
UpdatesHistory = "0x00000000000000000000000000000000"

MyType = "Machine"
TargetType = "Job"
Name = "vm2@xxxxxxxxxxxxxxxxx"
Machine = "aBox.mydomain.com"
Rank = Scheduler =?= "DedicatedScheduler@xxxxxxxxxxxxxxxxx"
CpuBusy = ((LoadAvg - CondorLoadAvg) >= 0.500000)
COLLECTOR_HOST_STRING = "aBox.mydomain.com"
DedicatedScheduler = "DedicatedScheduler@xxxxxxxxxxxxxxxxx"
CondorVersion = "$CondorVersion: 6.8.4 Feb  1 2007 $"
CondorPlatform = "$CondorPlatform: I386-LINUX_RHEL3 $"
VirtualMachineID = 2
VirtualMemory = 1047884
Disk = 10729620
CondorLoadAvg = 0.000000
LoadAvg = 0.000000
KeyboardIdle = 0
ConsoleIdle = 0
Memory = 1008
Cpus = 1
StartdIpAddr = "<10.20.9.240:52859>"
Arch = "INTEL"
OpSys = "LINUX"
UidDomain = "aBox.mydomain.com"
FileSystemDomain = "aBox.mydomain.com"
Subnet = "10.20.9"
HasIOProxy = TRUE
CheckpointPlatform = "LINUX INTEL 2.6.x normal"
TotalVirtualMemory = 2095768
TotalDisk = 21459240
TotalCpus = 2
TotalMemory = 2017
KFlops = 775895
Mips = 2371
LastBenchmark = 1179931251
TotalLoadAvg = 0.120000
TotalCondorLoadAvg = 0.000000
ClockMin = 950
ClockDay = 3
TotalVirtualMachines = 2
HasFileTransfer = TRUE
HasPerFileEncryption = TRUE
HasReconnect = TRUE
HasMPI = TRUE
HasTDP = TRUE
HasJobDeferral = TRUE
HasJICLocalConfig = TRUE
HasJICLocalStdin = TRUE
JavaVendor = "Sun Microsystems Inc."
JavaVersion = "1.5.0_10"
JavaMFlops = 189.330658
HasJava = TRUE
HasPVM = TRUE
HasRemoteSyscalls = TRUE
HasCheckpointing = TRUE
StarterAbilityList = "HasFileTransfer,HasPerFileEncryption,HasReconnect,HasMPI,HasTDP,HasJobDeferral,HasJICLocalConfig,HasJICLocalStdin,HasJava,HasPVM,HasRemoteSyscalls,HasCheckpointing"
CpuBusyTime = 0
CpuIsBusy = FALSE
TimeToLive = 2147483647
State = "Unclaimed"
EnteredCurrentState = 1179931846
Activity = "Idle"
EnteredCurrentActivity = 1179931846
Start = TRUE
Requirements = (START) && (IsValidCheckpointPlatform)
IsValidCheckpointPlatform = (((TARGET.JobUniverse == 1) == FALSE) || ((MY.CheckpointPlatform =!= UNDEFINED) && ((TARGET.LastCheckpointPlatform =?= MY.CheckpointPlatform) || (TARGET.NumCkpts == 0))))
MaxJobRetirementTime = 0
CurrentRank = 0.000000
MonitorSelfTime = 1179931731
MonitorSelfCPUUsage = 0.000000
MonitorSelfImageSize = 8152.000000
MonitorSelfResidentSetSize = 3776
MonitorSelfAge = 0
MonitorSelfRegisteredSocketCount = 2
DaemonStartTime = 1179931245
UpdateSequenceNumber = 6
MyAddress = "<10.20.9.240:52859>"
LastHeardFrom = 1179931856
UpdatesTotal = 7
UpdatesSequenced = 6
UpdatesLost = 0
UpdatesHistory = "0x00000000000000000000000000000000"