[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [HTCondor-users] Windows job won't run, targeted machines get to "Matched" but nothing more



Hi Jordan,

After we merged python-condor into the main HTCondor repo, I stopped updating the GitHub repository.  Before the 7.9.4 release, there were several improvements to the submit functions to avoid issues like the one you describe.

Is it possible to switch the submitter to 7.9.4?  It may bypass having to debug the issue on known older code.

Brian

On Mar 19, 2013, at 11:48 AM, Jordan Williamson <jordan.williamson@xxxxxxxxxxx> wrote:

So I've been trying to figure out this problem for a few days now, and I've pretty much exhausted all the solutions I can think of.

I'm trying to use Python-Condor (github repo) and Condor v.7.8.7/6 to submit jobs to a Windows 7 machine (x86_64). The crazy thing is, if I submit a job using condor_submit, it runs fine, but using python-condor with what I believe are the exact same ClassAds, it refuses to run. I've tried using the -dump argument with condor_submit and copying those ClassAds directly, but something about submitting the job through Python-Condor seems to have prevented the job from starting.

I can see on the targeted Windows execute node that the job is being matched to the right machine, but it doesn't progress any further than that.

Relevant information:

condor_q -l (job submitted through condor_submit)
BufferSize = 524288
NiceUser = false
CoreSize = 0
CumulativeSlotTime = 0
GlobalJobId = "10-215-91-146.ec2.internal#19.0#1363711460"
RequestCpus = 1
TransferOut = false
Err = "/dev/null"
BufferBlockSize = 32768
ImageSize = 1
CurrentTime = time()
WantCheckpoint = false
CommittedTime = 0
TargetType = "Machine"
WhenToTransferOutput = "ON_EXIT"
ServerTime = 1363711465
Cmd = "/home/ubuntu/echo.bat"
JobUniverse = 5
ExitBySignal = false
TransferIn = false
Iwd = "/home/ubuntu"
NumRestarts = 0
CommittedSuspensionTime = 0
Owner = "ubuntu"
NumSystemHolds = 0
CumulativeSuspensionTime = 0
TransferErr = false
Environment = ""
RequestDisk = DiskUsage
Requirements = ( ( OpSys == "WINDOWS" && Arch == "x86_64" ) ) && ( TARGET.Disk >= RequestDisk ) && ( TARGET.Memory >= RequestMemory ) && ( TARGET.HasFileTransfer )
MinHosts = 1
JobNotification = 2
NumCkpts = 0
LastSuspensionTime = 0
NumJobStarts = 0
WantRemoteSyscalls = false
JobLeaseDuration = 1200
ImageSize_RAW = 1
JobPrio = 0
RootDir = "/"
CurrentHosts = 0
WantRemoteIO = true
DiskUsage_RAW = 1
DiskUsage = 1
In = "/dev/null"
PeriodicRemove = false
ExecutableSize = 1
RemoteUserCpu = 0.0
LocalUserCpu = 0.0
LastRejMatchTime = 1363711460
RemoteSysCpu = 0.0
LocalSysCpu = 0.0
ClusterId = 19
CompletionDate = 0
RemoteWallClockTime = 0.0
Rank = 0.0
LeaveJobInQueue = false
MyType = "Job"
CondorVersion = "$CondorVersion: 7.8.6 Oct 24 2012 BuildID: 73238 $"
LastRejMatchReason = "no match found"
NumCkpts_RAW = 0
ProcId = 0
PeriodicHold = false
User = "ubuntu@xxxxxxxxxxxxxxxxxx"
LastJobStatus = 0
Arguments = ""
Out = "/dev/null"
JobStatus = 1
ExecutableSize_RAW = 1
PeriodicRelease = false
AutoClusterAttrs = "JobUniverse,LastCheckpointPlatform,NumCkpts,jordan,DiskUsage,ImageSize,RequestDisk,RequestMemory,Requirements,NiceUser,ConcurrencyLimits"
RequestMemory = ifthenelse(MemoryUsage =!= undefined,MemoryUsage,( ImageSize + 1023 ) / 1024)
MaxHosts = 1
TotalSuspensions = 0
CommittedSlotTime = 0
CondorPlatform = "$CondorPlatform: x86_64_deb_6.0 $"
AutoClusterId = 1
ShouldTransferFiles = "YES"
ExitStatus = 0
QDate = 1363711460
EnteredCurrentStatus = 1363711460

condor_q -l (job submitted through python-condor)

DiskUsage_RAW = 1
BufferSize = 524288
Requirements = ( ( OpSys == "WINDOWS" && Arch == "x86_64" ) ) && ( TARGET.Disk >= RequestDisk ) && ( TARGET.Memory >= RequestMemory ) && ( TARGET.HasFileTransfer )
NiceUser = false
CoreSize = 0
RemoteUserCpu = 0.0
GlobalJobId = "ip-10-242-101-14.ec2.internal#641.0#1363710358"
RequestCpus = 1
NumJobStarts = 0
TransferOut = false
Err = "/dev/null"
ImageSize = 1
CurrentTime = time()
JobLeaseDuration = 1200
CurrentHosts = 0
TotalSuspensions = 0
TargetType = "Machine"
WantRemoteSyscalls = false
WantRemoteIO = true
ServerTime = 1363711521
Cmd = "/home/ubuntu/echo.bat"
JobUniverse = 5
RemoteWallClockTime = 0.0
TransferIn = false
CondorVersion = "$CondorVersion: 7.8.6 Oct 24 2012 BuildID: 73238 $"
JobNotification = 2
LocalUserCpu = 0.0
Iwd = "/home/ubuntu"
RemoteSysCpu = 0.0
NumRestarts = 0
NumSystemHolds = 0
LastRejMatchTime = 1363711231
LastJobStatus = 0
Owner = "ubuntu"
CondorPlatform = "$CondorPlatform: x86_64_deb_6.0 $"
TransferErr = false
RequestMemory = ifthenelse(MemoryUsage =!= undefined,MemoryUsage,( ImageSize + 1023 ) / 1024)
Environment = ""
ExecutableSize_RAW = 1
RequestDisk = DiskUsage
MinHosts = 1
WhenToTransferOutput = "ON_EXIT"
EnteredCurrentStatus = 1363709957
CommittedSuspensionTime = 0
NumCkpts_RAW = 0
NumCkpts = 0
PeriodicHold = false
AutoClusterId = 3
CheckpointPlatform = ""
RootDir = "/"
JobPrio = 0
BufferBlockSize = 32768
PeriodicRemove = false
PeriodicRelease = false
LeaveJobInQueue = false
LastRejMatchReason = "no match found"
CommittedSlotTime = 0
DiskUsage = 1
In = "/dev/null"
AutoClusterAttrs = "JobUniverse,LastCheckpointPlatform,NumCkpts,jordan,DiskUsage,ImageSize,RequestDisk,RequestMemory,Requirements,NiceUser,ConcurrencyLimits"
ExecutableSize = 1
LocalSysCpu = 0.0
CommittedTime = 0
ClusterId = 641
CumulativeSuspensionTime = 0
CompletionDate = 0
Rank = 0.0
WantCheckpoint = false
CumulativeSlotTime = 0
MyType = "Job"
ProcId = 0
User = "ubuntu@xxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
ExitBySignal = false
Arguments = ""
LastSuspensionTime = 0
Out = "/dev/null"
JobStatus = 1
MaxHosts = 1
ImageSize_RAW = 1
ShouldTransferFiles = "YES"
ExitStatus = 0
QDate = 1363709957

condor_status -l (for Windows machine; Note: this machine has several slots, I will just give one of those here)

TotalCondorLoadAvg = 0.0
MonitorSelfTime = 1363711484
OpSys = "WINDOWS"
DetectedCpus = 4
Mips = 7402
LastFetchWorkSpawned = 0
AuthenticatedIdentity = "unauthenticated@unmapped"
HibernationState = "NONE"
TotalLoadAvg = 0.070000
HasMPI = true
OpSysAndVer = "WINDOWS601"
MyType = "Machine"
Rank = 0.0
LoadAvg = 0.0
VirtualMemory = 1048575
SlotWeight = Cpus
CheckpointPlatform = "WINDOWS X86_64 Unknown normal"
WindowsBuildNumber = 7601
HardwareAddress = ""
HasJobDeferral = true
TotalSlots = 4
TotalDisk = 22476588
HasJICLocalStdin = true
WindowsServicePackMajorVersion = 1
TotalSlotDisk = 5628964
HasFileTransferPluginMethods = "data"
Requirements = false
NumPids = 0
ClockMin = 765
UpdatesTotal = 47
HasJICLocalConfig = true
UtsnameVersion = undefined
CpuIsBusy = false
CpuBusy = ( ( LoadAvg - CondorLoadAvg ) >= 0.500000 )
UidDomain = "Skyhold"
Arch = "X86_64"
ExpectedMachineQuickDrainingCompletion = 1363711556
WakeOnLanEnabledFlags = "NONE"
TargetType = "Job"
HasFileTransfer = true
Account_name = ""
SubnetMask = "255.255.255.255"
KeyboardIdle = 1223
CondorPlatform = "$CondorPlatform: x86_64_winnt_6.1 $"
WindowsProductType = 1
TotalTimeOwnerIdle = 5
OpSysVer = 601
CpuBusyTime = 0
StartdIpAddr = "<172.16.0.100:50904>"
StarterAbilityList = "HasFileTransferPluginMethods,HasJobDeferral,HasTDP,HasFileTransfer,HasJICLocalConfig,HasVM,HasWindowsRunAsOwner,HasReconnect,HasMPI,HasPerFileEncryption,HasJICLocalStdin"
RecentDaemonCoreDutyCycle = 0.005685
HasReconnect = true
CanHibernate = true
DaemonCoreDutyCycle = 0.009656
TotalMemory = 3979
Machine = "Skyhold"
UpdatesLost = 0
MonitorSelfAge = 3846
MaxJobRetirementTime = 0
Disk = 5619147
Unhibernate = MY.MachineLastMatchTime =!= undefined
IsWakeOnLanEnabled = false
CondorVersion = "$CondorVersion: 7.8.7 Dec 12 2012 BuildID: 86173 $"
ExpectedMachineGracefulDrainingBadput = 0
LastHeardFrom = 1363711526
Activity = "Idle"
TotalTimeMatchedIdle = 1471
TotalSlotMemory = 994
TotalVirtualMemory = 4194303
OpSysName = "Windows7"
TotalTimeClaimedIdle = 3
TotalCpus = 4
HasVM = false
TotalSlotCpus = 1
IsWakeAble = false
Name = "slot4@Skyhold"
TotalTimeClaimedBusy = 21
SlotID = 4
HibernationSupportedStates = "S3,S4,S5"
MonitorSelfRegisteredSocketCount = 1
Price = 0.050000
MonitorSelfSecuritySessions = 12
COLLECTOR_HOST_STRING = "cm.dychron.com"
IsWakeOnLanSupported = false
UtsnameRelease = undefined
OpSysShortName = "Win7"
OpSysMajorVer = 601
FileSystemDomain = "Skyhold"
UpdatesSequenced = 44
EnteredCurrentState = 1363711550
CondorLoadAvg = 0.0
HasWindowsRunAsOwner = true
MonitorSelfImageSize = 101156.000000
ConsoleIdle = 1223
DaemonStartTime = 1363707638
State = "Matched"
UtsnameSysname = undefined
MonitorSelfResidentSetSize = 14968
UtsnameMachine = undefined
TotalTimeUnclaimedIdle = 2419
DetectedMemory = 3979
HasTDP = true
CurrentRank = 0.0
WindowsMinorVersion = 1
UtsnameNodename = undefined
LastUpdate = 1363707682
Cpus = 1
Memory = 994
HasIOProxy = true
TimeToLive = 2147483647
OpSysLegacy = "WINNT61"
KFlops = 678780
LastFetchWorkCompleted = 0
MyCurrentTime = 1363711557
MachineMaxVacateTime = 10 * 60
NextFetchWorkDelay = -1
ExpectedMachineQuickDrainingBadput = 0
UpdatesHistory = "0x00000000000000000000000000000000"
MonitorSelfCPUUsage = 0.226853
LastBenchmark = 1363707682
ExpectedMachineGracefulDrainingCompletion = 1363711556
CurrentTime = time()
WindowsServicePackMinorVersion = 0
UpdateSequenceNumber = 42
WakeOnLanSupportedFlags = "NONE"
OpSysLongName = "Windows 7 SP1"
ClockDay = 2
MyAddress = "<172.16.0.100:50904>"
HibernationLevel = 0
HasPerFileEncryption = true
EnteredCurrentActivity = 1363711550
WindowsMajorVersion = 6
DotNetVersions = "2.0,3.0,3.5,4.0Client,4.0Full"

Some config options on the Windows machine:

START = True
SUSPEND = False
CONTINUE = True
PREEMPT = False
KILL = False

---
Thanks,
Jordan Williamson
_______________________________________________
HTCondor-users mailing list
To unsubscribe, send a message to htcondor-users-request@xxxxxxxxxxx with a
subject: Unsubscribe
You can also unsubscribe by visiting
https://lists.cs.wisc.edu/mailman/listinfo/htcondor-users

The archives can be found at:
https://lists.cs.wisc.edu/archive/htcondor-users/

Attachment: smime.p7s
Description: S/MIME cryptographic signature