[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Condor-users] Jobs that won't condor_rm




I have some jobs that users submitted to the globus universe
sitting in the queue of my schedd right now, and I cannot remove
them with condor_rm, even as root.
When I execute the condor_rm command, they show status "X" for a few
seconds and then revert to status "H".

000 (26058.000.000) 12/21 09:11:17 Job submitted from host: <131.225.167.42:34784>
...
017 (26058.000.000) 12/21 09:11:26 Job submitted to Globus
    RM-Contact: fngp-osg.fnal.gov/jobmanager-condor
    JM-Contact: fngp-osg.fnal.gov/jobmanager-condor
    Can-Restart-JM: 1
...
001 (26058.000.000) 12/21 09:14:00 Job executing on host: gt2 fngp-osg.fnal.gov/jobmanager-condor
...
012 (26058.000.000) 12/25 10:14:50 Job was held.
        Globus error 22: the job manager failed to create an internal script argument file
        Code 2 Subcode 22
...
012 (26058.000.000) 12/28 14:22:04 Job was held.
        Globus error 7: authentication with the remote server failed
        Code 2 Subcode 7
...
012 (26058.000.000) 12/28 14:35:13 Job was held.
        Globus error 7: authentication with the remote server failed
        Code 2 Subcode 7
...

-------------------------------------------------------------

I have made sure that there are no condor processes running
on the "remote server", which in this case is the same as the
submit machine. It appears that when condor_rm is run on a globus
universe job, it tries to contact the remote server to kill the job,
but can't do so because the proxy has obviously expired long ago.

Any idea how to get rid of such a job?  Output of condor_q -long
is below for one of them.

Steve Timm


-- Submitter: fngp-osg.fnal.gov : <131.225.167.42:34784> : fngp-osg.fnal.gov
MyType = "Job"
TargetType = "Machine"
ClusterId = 26058
QDate = 1135177877
CompletionDate = 0
Owner = "yoo"
LocalUserCpu = 0.000000
LocalSysCpu = 0.000000
RemoteUserCpu = 0.000000
RemoteSysCpu = 0.000000
ExitStatus = 0
NumCkpts = 0
NumRestarts = 0
CommittedTime = 0
TotalSuspensions = 0
LastSuspensionTime = 0
CumulativeSuspensionTime = 0
ExitBySignal = FALSE
CondorVersion = "$CondorVersion: 6.7.12 Sep 24 2005 $"
CondorPlatform = "$CondorPlatform: I386-LINUX_RH9 $"
RootDir = "/"
Iwd = "/home/yoo/project/cdms/DarkPipe/DP_bin_10.16"
JobUniverse = 9
Cmd = "/home/yoo/project/cdms/DarkPipe/DP_bin_10.16/runpipeclean.sh"
MinHosts = 1
MaxHosts = 1
CurrentHosts = 0
WantRemoteSyscalls = FALSE
WantCheckpoint = FALSE
RemoteSpoolDir = "/local/stage1/condor/spool/cluster26058.proc0.subproc0"
x509userproxysubject = "/DC=org/DC=doegrids/OU=People/CN=Jonghee Yoo 223786"
x509userproxy = "/tmp/x509up_u11998"
JobPrio = 0
User = "yoo@xxxxxxxx"
NiceUser = FALSE
Env = ""
JobNotification = 0
WantRemoteIO = TRUE
UserLog = "/home/yoo/project/cdms/DarkPipe/DP_bin_10.16/grid_cdms/grid_cdms.log.26058.0"
CoreSize = 0
KillSig = "SIGTERM"
Rank = (Mips)
In = "/dev/null"
TransferIn = FALSE
Out = "grid_cdms/grid_cdms.out.26058.0"
StreamOut = TRUE
Err = "grid_cdms/grid_cdms.err.26058.0"
StreamErr = TRUE
BufferSize = 524288
BufferBlockSize = 32768
ShouldTransferFiles = "NO"
TransferFiles = "NEVER"
ImageSize = 1
ExecutableSize = 1
DiskUsage = 1
Requirements = TRUE
FileSystemDomain = "fnal.gov"
PeriodicHold = FALSE
PeriodicRelease = FALSE
PeriodicRemove = FALSE
OnExitHold = FALSE
OnExitRemove = TRUE
LeaveJobInQueue = FALSE
Args = "140726_1601"
GridResource = "gt2 fngp-osg.fnal.gov/jobmanager-condor"
GlobusResubmit = FALSE
WantClaiming = FALSE
GlobusRSL = "(jobtype=single)(maxwalltime=999)"
GlobalJobId = "fngp-osg.fnal.gov#1135177877#26058.0"
ProcId = 0
GlobusGramVersion = 3
NumGlobusSubmits = 1
GridJobId = "gt2 fngp-osg.fnal.gov/jobmanager-condor https://fngp-osg.fnal.gov:49645/23112/1135177881/";
GlobusStatus = 0
RemoteWallClockTime = 349250.000000
WallClockCheckpoint = UNDEFINED
ShadowBday = 0
RemoveReason = "via condor_rm (by user root)"
JobStatusOnRelease = 3
JobStatus = 5
EnteredCurrentStatus = 1135802113
HoldReason = "Globus error 7: authentication with the remote server failed"
HoldReasonCode = 2
HoldReasonSubCode = 7
ReleaseReason = UNDEFINED
NumSystemHolds = 3
Managed = "Schedd"
ServerTime = 1135802274




--
------------------------------------------------------------------
Steven C. Timm, Ph.D  (630) 840-8525  timm@xxxxxxxx  http://home.fnal.gov/~timm/
Fermilab Computing Div/Core Support Services Dept./Scientific Computing Section
Assistant Group Leader, Farms and Clustered Systems Group
Lead of Computing Farms Team