[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[HTCondor-users] Windows job won't run, targeted machines get to "Matched" but nothing more



So I've been trying to figure out this problem for a few days now, and I've pretty much exhausted all the solutions I can think of.

I'm trying to use Python-Condor (github repo) and Condor v.7.8.7/6 to submit jobs to a Windows 7 machine (x86_64). The crazy thing is, if I submit a job using condor_submit, it runs fine, but when I submit it using python-condor with what I believe are the exact same ClassAds, it refuses to run. I've tried using the -dump argument with condor_submit and copying those ClassAds directly, but something about submitting the job through Python-Condor seems to be preventing the job from starting.

I can see on the targeted Windows execute node that the job is being matched to the right machine, but it doesn't progress any further than that.

Relevant information:

condor_q -l (job submitted through condor_submit)
BufferSize = 524288
NiceUser = false
CoreSize = 0
CumulativeSlotTime = 0
GlobalJobId = "10-215-91-146.ec2.internal#19.0#1363711460"
RequestCpus = 1
TransferOut = false
Err = "/dev/null"
BufferBlockSize = 32768
ImageSize = 1
CurrentTime = time()
WantCheckpoint = false
CommittedTime = 0
TargetType = "Machine"
WhenToTransferOutput = "ON_EXIT"
ServerTime = 1363711465
Cmd = "/home/ubuntu/echo.bat"
JobUniverse = 5
ExitBySignal = false
TransferIn = false
Iwd = "/home/ubuntu"
NumRestarts = 0
CommittedSuspensionTime = 0
Owner = "ubuntu"
NumSystemHolds = 0
CumulativeSuspensionTime = 0
TransferErr = false
Environment = ""
RequestDisk = DiskUsage
Requirements = ( ( OpSys == "WINDOWS" && Arch == "x86_64" ) ) && ( TARGET.Disk >= RequestDisk ) && ( TARGET.Memory >= RequestMemory ) && ( TARGET.HasFileTransfer )
MinHosts = 1
JobNotification = 2
NumCkpts = 0
LastSuspensionTime = 0
NumJobStarts = 0
WantRemoteSyscalls = false
JobLeaseDuration = 1200
ImageSize_RAW = 1
JobPrio = 0
RootDir = "/"
CurrentHosts = 0
WantRemoteIO = true
DiskUsage_RAW = 1
DiskUsage = 1
In = "/dev/null"
PeriodicRemove = false
ExecutableSize = 1
RemoteUserCpu = 0.0
LocalUserCpu = 0.0
LastRejMatchTime = 1363711460
RemoteSysCpu = 0.0
LocalSysCpu = 0.0
ClusterId = 19
CompletionDate = 0
RemoteWallClockTime = 0.0
Rank = 0.0
LeaveJobInQueue = false
MyType = "Job"
CondorVersion = "$CondorVersion: 7.8.6 Oct 24 2012 BuildID: 73238 $"
LastRejMatchReason = "no match found"
NumCkpts_RAW = 0
ProcId = 0
PeriodicHold = false
User = "ubuntu@xxxxxxxxxxxxxxxxxx"
LastJobStatus = 0
Arguments = ""
Out = "/dev/null"
JobStatus = 1
ExecutableSize_RAW = 1
PeriodicRelease = false
AutoClusterAttrs = "JobUniverse,LastCheckpointPlatform,NumCkpts,jordan,DiskUsage,ImageSize,RequestDisk,RequestMemory,Requirements,NiceUser,ConcurrencyLimits"
RequestMemory = ifthenelse(MemoryUsage =!= undefined,MemoryUsage,( ImageSize + 1023 ) / 1024)
MaxHosts = 1
TotalSuspensions = 0
CommittedSlotTime = 0
CondorPlatform = "$CondorPlatform: x86_64_deb_6.0 $"
AutoClusterId = 1
ShouldTransferFiles = "YES"
ExitStatus = 0
QDate = 1363711460
EnteredCurrentStatus = 1363711460

condor_q -l (job submitted through python-condor)

DiskUsage_RAW = 1
BufferSize = 524288
Requirements = ( ( OpSys == "WINDOWS" && Arch == "x86_64" ) ) && ( TARGET.Disk >= RequestDisk ) && ( TARGET.Memory >= RequestMemory ) && ( TARGET.HasFileTransfer )
NiceUser = false
CoreSize = 0
RemoteUserCpu = 0.0
GlobalJobId = "ip-10-242-101-14.ec2.internal#641.0#1363710358"
RequestCpus = 1
NumJobStarts = 0
TransferOut = false
Err = "/dev/null"
ImageSize = 1
CurrentTime = time()
JobLeaseDuration = 1200
CurrentHosts = 0
TotalSuspensions = 0
TargetType = "Machine"
WantRemoteSyscalls = false
WantRemoteIO = true
ServerTime = 1363711521
Cmd = "/home/ubuntu/echo.bat"
JobUniverse = 5
RemoteWallClockTime = 0.0
TransferIn = false
CondorVersion = "$CondorVersion: 7.8.6 Oct 24 2012 BuildID: 73238 $"
JobNotification = 2
LocalUserCpu = 0.0
Iwd = "/home/ubuntu"
RemoteSysCpu = 0.0
NumRestarts = 0
NumSystemHolds = 0
LastRejMatchTime = 1363711231
LastJobStatus = 0
Owner = "ubuntu"
CondorPlatform = "$CondorPlatform: x86_64_deb_6.0 $"
TransferErr = false
RequestMemory = ifthenelse(MemoryUsage =!= undefined,MemoryUsage,( ImageSize + 1023 ) / 1024)
Environment = ""
ExecutableSize_RAW = 1
RequestDisk = DiskUsage
MinHosts = 1
WhenToTransferOutput = "ON_EXIT"
EnteredCurrentStatus = 1363709957
CommittedSuspensionTime = 0
NumCkpts_RAW = 0
NumCkpts = 0
PeriodicHold = false
AutoClusterId = 3
CheckpointPlatform = ""
RootDir = "/"
JobPrio = 0
BufferBlockSize = 32768
PeriodicRemove = false
PeriodicRelease = false
LeaveJobInQueue = false
LastRejMatchReason = "no match found"
CommittedSlotTime = 0
DiskUsage = 1
In = "/dev/null"
AutoClusterAttrs = "JobUniverse,LastCheckpointPlatform,NumCkpts,jordan,DiskUsage,ImageSize,RequestDisk,RequestMemory,Requirements,NiceUser,ConcurrencyLimits"
ExecutableSize = 1
LocalSysCpu = 0.0
CommittedTime = 0
ClusterId = 641
CumulativeSuspensionTime = 0
CompletionDate = 0
Rank = 0.0
WantCheckpoint = false
CumulativeSlotTime = 0
MyType = "Job"
ProcId = 0
User = "ubuntu@xxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
ExitBySignal = false
Arguments = ""
LastSuspensionTime = 0
Out = "/dev/null"
JobStatus = 1
MaxHosts = 1
ImageSize_RAW = 1
ShouldTransferFiles = "YES"
ExitStatus = 0
QDate = 1363709957

condor_status -l (for the Windows machine; note: this machine has several slots, so I will give just one of them here)

TotalCondorLoadAvg = 0.0
MonitorSelfTime = 1363711484
OpSys = "WINDOWS"
DetectedCpus = 4
Mips = 7402
LastFetchWorkSpawned = 0
AuthenticatedIdentity = "unauthenticated@unmapped"
HibernationState = "NONE"
TotalLoadAvg = 0.070000
HasMPI = true
OpSysAndVer = "WINDOWS601"
MyType = "Machine"
Rank = 0.0
LoadAvg = 0.0
VirtualMemory = 1048575
SlotWeight = Cpus
CheckpointPlatform = "WINDOWS X86_64 Unknown normal"
WindowsBuildNumber = 7601
HardwareAddress = ""
HasJobDeferral = true
TotalSlots = 4
TotalDisk = 22476588
HasJICLocalStdin = true
WindowsServicePackMajorVersion = 1
TotalSlotDisk = 5628964
HasFileTransferPluginMethods = "data"
Requirements = false
NumPids = 0
ClockMin = 765
UpdatesTotal = 47
HasJICLocalConfig = true
UtsnameVersion = undefined
CpuIsBusy = false
CpuBusy = ( ( LoadAvg - CondorLoadAvg ) >= 0.500000 )
UidDomain = "Skyhold"
Arch = "X86_64"
ExpectedMachineQuickDrainingCompletion = 1363711556
WakeOnLanEnabledFlags = "NONE"
TargetType = "Job"
HasFileTransfer = true
Account_name = ""
SubnetMask = "255.255.255.255"
KeyboardIdle = 1223
CondorPlatform = "$CondorPlatform: x86_64_winnt_6.1 $"
WindowsProductType = 1
TotalTimeOwnerIdle = 5
OpSysVer = 601
CpuBusyTime = 0
StartdIpAddr = "<172.16.0.100:50904>"
StarterAbilityList = "HasFileTransferPluginMethods,HasJobDeferral,HasTDP,HasFileTransfer,HasJICLocalConfig,HasVM,HasWindowsRunAsOwner,HasReconnect,HasMPI,HasPerFileEncryption,HasJICLocalStdin"
RecentDaemonCoreDutyCycle = 0.005685
HasReconnect = true
CanHibernate = true
DaemonCoreDutyCycle = 0.009656
TotalMemory = 3979
Machine = "Skyhold"
UpdatesLost = 0
MonitorSelfAge = 3846
MaxJobRetirementTime = 0
Disk = 5619147
Unhibernate = MY.MachineLastMatchTime =!= undefined
IsWakeOnLanEnabled = false
CondorVersion = "$CondorVersion: 7.8.7 Dec 12 2012 BuildID: 86173 $"
ExpectedMachineGracefulDrainingBadput = 0
LastHeardFrom = 1363711526
Activity = "Idle"
TotalTimeMatchedIdle = 1471
TotalSlotMemory = 994
TotalVirtualMemory = 4194303
OpSysName = "Windows7"
TotalTimeClaimedIdle = 3
TotalCpus = 4
HasVM = false
TotalSlotCpus = 1
IsWakeAble = false
Name = "slot4@Skyhold"
TotalTimeClaimedBusy = 21
SlotID = 4
HibernationSupportedStates = "S3,S4,S5"
MonitorSelfRegisteredSocketCount = 1
Price = 0.050000
MonitorSelfSecuritySessions = 12
COLLECTOR_HOST_STRING = "cm.dychron.com"
IsWakeOnLanSupported = false
UtsnameRelease = undefined
OpSysShortName = "Win7"
OpSysMajorVer = 601
FileSystemDomain = "Skyhold"
UpdatesSequenced = 44
EnteredCurrentState = 1363711550
CondorLoadAvg = 0.0
HasWindowsRunAsOwner = true
MonitorSelfImageSize = 101156.000000
ConsoleIdle = 1223
DaemonStartTime = 1363707638
State = "Matched"
UtsnameSysname = undefined
MonitorSelfResidentSetSize = 14968
UtsnameMachine = undefined
TotalTimeUnclaimedIdle = 2419
DetectedMemory = 3979
HasTDP = true
CurrentRank = 0.0
WindowsMinorVersion = 1
UtsnameNodename = undefined
LastUpdate = 1363707682
Cpus = 1
Memory = 994
HasIOProxy = true
TimeToLive = 2147483647
OpSysLegacy = "WINNT61"
KFlops = 678780
LastFetchWorkCompleted = 0
MyCurrentTime = 1363711557
MachineMaxVacateTime = 10 * 60
NextFetchWorkDelay = -1
ExpectedMachineQuickDrainingBadput = 0
UpdatesHistory = "0x00000000000000000000000000000000"
MonitorSelfCPUUsage = 0.226853
LastBenchmark = 1363707682
ExpectedMachineGracefulDrainingCompletion = 1363711556
CurrentTime = time()
WindowsServicePackMinorVersion = 0
UpdateSequenceNumber = 42
WakeOnLanSupportedFlags = "NONE"
OpSysLongName = "Windows 7 SP1"
ClockDay = 2
MyAddress = "<172.16.0.100:50904>"
HibernationLevel = 0
HasPerFileEncryption = true
EnteredCurrentActivity = 1363711550
WindowsMajorVersion = 6
DotNetVersions = "2.0,3.0,3.5,4.0Client,4.0Full"

Some config options on the Windows machine:

START = True
SUSPEND = False
CONTINUE = True
PREEMPT = False
KILL = False

---
Thanks,
Jordan Williamson