[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [condor-users] How to set up the MPI Universe in windows?



Ok, I have set up two computers (P04 & P05) running windows 2000, both
dedicated resources, with P04 as the dedicated scheduler.  When I run an mpi
job, the job stays idle; the log files show that the computers are rejecting
the job based on their requirements. I have not been able to figure out
which requirements are causing the job not to run. Below is an ad-file for
the job and for one of the computers.



Does anybody see why these are not matching up?



Thanks



Cliff Padgett



Job ad follows

-- Submitter: P04.Cluster.mte.ncsu.edu : <10.0.0.4:1042> :
P04.Cluster.mte.ncsu.edu

MyType = "Job"

TargetType = "Machine"

ClusterId = 2

QDate = 1083004477

CompletionDate = 0

Owner = "cwpadget"

NTDomain = "PRECISION-1"

RemoteWallClockTime = 0.000000

LocalUserCpu = 0.000000

LocalSysCpu = 0.000000

RemoteUserCpu = 0.000000

RemoteSysCpu = 0.000000

ExitStatus = 0

NumCkpts = 0

NumRestarts = 0

NumSystemHolds = 0

CommittedTime = 0

TotalSuspensions = 0

LastSuspensionTime = 0

CumulativeSuspensionTime = 0

ExitBySignal = FALSE

CondorVersion = "$CondorVersion: 6.6.2 Mar 17 2004 $"

CondorPlatform = "$CondorPlatform: INTEL-WINNT40 $"

Iwd = "C:\"

JobUniverse = 8

Cmd = "c:\sdp.exe"

CurrentHosts = 0

WantRemoteSyscalls = FALSE

WantCheckpoint = FALSE

MinHosts = 2

MaxHosts = 2

JobStatus = 1

EnteredCurrentStatus = 1083004478

JobPrio = 0

User = "cwpadget@xxxxxxx*"

NiceUser = FALSE

Env = ""

JobNotification = 2

CoreSize = 4556680

Rank = 0.000000

In = "/dev/null"

TransferIn = FALSE

Out = "/dev/null"

TransferOut = FALSE

Err = "/dev/null"

TransferErr = FALSE

BufferSize = 524288

BufferBlockSize = 32768

ShouldTransferFiles = "YES"

WhenToTransferOutput = "ON_EXIT"

TransferFiles = "ONEXIT"

TransferInput = "C:\WINNT\System32\mpich.dll"

ImageSize = 357

ExecutableSize = 357

DiskUsage = 1005

Requirements = (Arch == "INTEL") && (OpSys == "WINNT50") && (Disk >=
DiskUsage) && ((Memory * 1024) >= ImageSize) && (HasMPI) &&
(HasFileTransfer)

PeriodicHold = FALSE

PeriodicRelease = FALSE

PeriodicRemove = FALSE

OnExitHold = FALSE

OnExitRemove = TRUE

LeaveJobInQueue = FALSE

Args = ""

ProcId = 0

Scheduler = "DedicatedScheduler@xxxxxxxxxxxxxxxxxxxxxxxx"

ServerTime = 1083004928





Computer ad follows

MyType = "Machine"

TargetType = "Job"

Name = "vm1@xxxxxxxxxxxxxxxxxxxxxxxx"

Machine = "P04.Cluster.mte.ncsu.edu"

Rank = Scheduler =?= "DedicatedScheduler@xxxxxxxxxxxxxxxxxxxxxxxx"

CpuBusy = ((LoadAvg - CondorLoadAvg) >= 0.500000)

COLLECTOR_HOST_STRING = "p04"

DedicatedScheduler = "DedicatedScheduler@xxxxxxxxxxxxxxxxxxxxxxxx"

CondorVersion = "$CondorVersion: 6.6.2 Mar 17 2004 $"

CondorPlatform = "$CondorPlatform: INTEL-WINNT40 $"

VirtualMachineID = 1

VirtualMemory = 1963100

Disk = 14657153

CondorLoadAvg = 0.000000

LoadAvg = 0.000000

KeyboardIdle = 0

ConsoleIdle = 0

Memory = 1024

Cpus = 1

StartdIpAddr = "<10.0.0.4:1041>"

Arch = "INTEL"

OpSys = "WINNT50"

UidDomain = "10.0.0.*"

FileSystemDomain = "10.0.0.*"

Subnet = "10.0.0"

HasIOProxy = TRUE

TotalVirtualMemory = 3926200

TotalDisk = 29314306

KFlops = 584397

Mips = 1916

LastBenchmark = 1083003533

TotalLoadAvg = 0.020000

TotalCondorLoadAvg = 0.000000

ClockMin = 878

ClockDay = 1

TotalVirtualMachines = 2

HasFileTransfer = TRUE

HasMPI = TRUE

HasJICLocalConfig = TRUE

HasJICLocalStdin = TRUE

StarterAbilityList =
"HasFileTransfer,HasMPI,HasJICLocalConfig,HasJICLocalStdin"

CpuBusyTime = 0

CpuIsBusy = FALSE

State = "Owner"

EnteredCurrentState = 1083003526

Activity = "Idle"

EnteredCurrentActivity = 1083003526

Start = Scheduler =?= "DedicatedScheduler@xxxxxxxxxxxxxxxxxxxxxxxx"

Requirements = START

CurrentRank = 0.000000

DaemonStartTime = 1083003525

UpdateSequenceNumber = 4

MyAddress = "<10.0.0.4:1041>"

LastHeardFrom = 1083004737

UpdatesTotal = 5

UpdatesSequenced = 4

UpdatesLost = 0

UpdatesHistory = "0x00000000000000000000000000000000"

----- Original Message ----- 
From: "Mark Silberstein" <marks@xxxxxxxxxxxxxxxxxxxxxxx>
To: <condor-users@xxxxxxxxxxx>
Sent: Friday, April 23, 2004 3:26 PM
Subject: Re: [condor-users] How to set up the MPI Universe in windows?


> The setup for Windows and for *NIX is just the same. Just take that
> "condor_config.local.dedicated.resource" file and put it on Windows.
>
>
> On Wed, 2004-04-14 at 15:56, Cliff Padgett wrote:
> > Hello, I'm trying to run condor on a Beowulf-like cluster of dell
> > computers (all running windows 2000 and MPICH).  So far I've been
> > successful at running in vanilla universe, but all attempts at running
> > under the mpi universe fail.  Well, the jobs don't crash they just stay
> > idle.  Condor_q analyze says that I have N match, but prefer another
> > specific job despite its worse user-priority.   The condor manual says
> > I need to have dedicated resource in order to run under the mpi
> > universe and suggests I look at condor_config.local.dedicated.resource
> > file, however this file doesn't seem to be part of the
> > condor-6.6.4-winnt40-x86.exe install?
> >
> >
> >
> > Is there any place I can get detailed instruction on how to set up an mpi
> > universe under windows?  I've also noticed that parts of the condor
> > manual say that windows only works under the vanilla universe and
> > parts of it say it also works under mpi?
> >
> >
>
> Condor Support Information:
> http://www.cs.wisc.edu/condor/condor-support/
> To Unsubscribe, send mail to majordomo@xxxxxxxxxxx with
> unsubscribe condor-users <your_email_address>
>
>

Condor Support Information:
http://www.cs.wisc.edu/condor/condor-support/
To Unsubscribe, send mail to majordomo@xxxxxxxxxxx with
unsubscribe condor-users <your_email_address>