[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [HTCondor-users] Condor held jobs should retry/release after certain configured timeout automatically



Sure. Attached classads.txt file.



On Tue, Apr 7, 2015 at 8:30 PM, Brian Bockelman <bbockelm@xxxxxxxxxxx> wrote:

On Apr 7, 2015, at 9:42 AM, Sridhar Thumma <deadman.den@xxxxxxxxx> wrote:

Hi,

Please see my comments inline:

On Tue, Apr 7, 2015 at 7:55 PM, Brian Bockelman <bbockelm@xxxxxxxxxxx> wrote:
Hi Sridhar,

The configuration seems reasonable. However, we'd need more context to know if it's working as expected.

1) Did you run condor_reconfig after changing the configuration?
I restarted condor using condor_restart. This should refresh config values, right?

Yup, that should be fine.

2) Can you give an example classad of a job you think should be released under this policy?
I submitted a grid job where the AMI ID is not valid. If the AMI ID is not valid, the job will go into the held state. In this case, it should retry the configured number of times. Does that make sense?

I actually want to use SYSTEM_PERIODIC_RELEASE to release jobs that go into the held state because of a "service unavailable" error from Amazon. I am using the above test to validate my configuration, as it is not possible to reproduce the service-unavailable error condition right now.



Yes - I understood this part. However, to understand why it's not doing what you think it should, we'd need to actually see the classad.

Brian


_______________________________________________
HTCondor-users mailing list
To unsubscribe, send a message to htcondor-users-request@xxxxxxxxxxx with a
subject: Unsubscribe
You can also unsubscribe by visiting
https://lists.cs.wisc.edu/mailman/listinfo/htcondor-users

The archives can be found at:
https://lists.cs.wisc.edu/archive/htcondor-users/


-- Submitter: cgw_dev@xxxxxxxxxxxxxxxxxxxxxx : <172.16.130.74:35003> : iad-dev-htc-a1.pdx.aws
MaxHosts = 1
Managed = "Schedd"
User = "cgw_dev@xxxxxxxxxxxxxxxxxxxxxx"
OnExitHold = false
CoreSize = 0
LastRemoteStatusUpdate = 1428418906
WantRemoteSyscalls = false
MyType = "Job"
Rank = 500 - TotalLoadAvg
CumulativeSuspensionTime = 0
ReleaseReason = undefined
MinHosts = 1
PeriodicHold = false
PeriodicRemove = false
Err = "/usr/local/mrgstorage/files/deployment_package/gpsdev/seqdata/ALL_GPS_Cases/gps-ccgs-blat-test-1_1436/7ed35721-3050-4bbb-b98d-cfe60a70b6ae/gps-ccgs-blat-test-1/logs/gene_names.err"
Submission = "cgw_dev@xxxxxxxxxxxxxxxxxxxxxx#2868"
ProcId = 0
EnteredCurrentStatus = 1428418912
UserLog = "/usr/local/mrgstorage/files/deployment_package/gpsdev/seqdata/ALL_GPS_Cases/gps-ccgs-blat-test-1_1436/7ed35721-3050-4bbb-b98d-cfe60a70b6ae/gps-ccgs-blat-test-1/logs/gene_names.log"
HoldReasonSubCode = 0
NumJobStarts = 0
JobUniverse = 9
In = "/dev/null"
Requirements = true
EC2VpcSubnet = "subnet-503cb427"
ClusterId = 2868
WhenToTransferOutput = "ON_EXIT"
CompletionDate = 0
EC2AmiID = "ami-c4f2d0ac"
BufferSize = 524288
Environment = "LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=01;05;37;41:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arj=01;31:*.taz=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.zip=01;31:*.z=01;31:*.Z=01;31:*.dz=01;31:*.gz=01;31:*.lz=01;31:*.xz=01;31:*.bz2=01;31:*.tbz=01;31:*.tbz2=01;31:*.bz=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.rar=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.jpg=01;35:*.jpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.axv=01;35:*.anx=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=01;36:*.au=01;36:*.flac=01;36:*.mid=01;36:*.midi=01;36:*.mka=01;36:*.mp3=01;36:*.mpc=01;36:*.ogg=01;36:*.ra=01;36:*.wav=01;36:*.axa=01;36:*.oga=01;36:*.spx=01;36:*.xspf=01;36: _=/usr/bin/condor_submit QTINC=/usr/lib64/qt-3.3/include CVS_RSH=ssh QTLIB=/usr/lib64/qt-3.3/lib HISTCONTROL=ignoredups PWD=/usr/local/mrgstorage/files/deployment_package/gpsdev/seqdata/ALL_GPS_Cases/gps-ccgs-blat-test-1_1436/7ed35721-3050-4bbb-b98d-cfe60a70b6ae/logs SHLVL=1 LANG=en_US.UTF-8 TERM=cygwin MAIL=/var/spool/mail/cgw_dev OLDPWD=/usr/local/mrgstorage/files/deployment_package/gpsdev/seqdata/ALL_GPS_Cases/gps-ccgs-blat-test-1_1436 LESSOPEN=||/usr/bin/lesspipe.sh' '%s SSH_ASKPASS=/usr/libexec/openssh/gnome-ssh-askpass G_BROKEN_FILENAMES=1 QTDIR=/usr/lib64/qt-3.3 SHELL=/bin/bash USER=cgw_dev 
PATH=/usr/lib64/qt-3.3/bin:/usr/local/bin:/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/sbin:/home/cgw_dev/bin HISTSIZE=1000 LOGNAME=cgw_dev HOSTNAME=iad-dev-htc-a1.pdx.aws HOME=/home/cgw_dev"
EC2TagName = "IAD-DEV-GET_GENE_NAMES_SCRRIPT"
TargetType = "Machine"
LeaveJobInQueue = false
JobNotification = 1
Owner = "cgw_dev"
CondorPlatform = "$CondorPlatform: X86_64-ScientificLinux_6.6 $"
CommittedTime = 0
QDate = 1428418903
TransferIn = false
ExitStatus = 0
NumCkpts_RAW = 0
HoldReason = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<Response><Errors><Error><Code>InvalidAMIID.NotFound</Code><Message>The image id 'ami-c4f2d0ac' does not exist</Message></Error></Errors><RequestID>eec8cbed-966f-4c4e-b2a6-dbf95310a56f</RequestID></Response>"
RootDir = "/"
CurrentHosts = 0
GlobalJobId = "cgw_dev@xxxxxxxxxxxxxxxxxxxxxx#2868.0#1428418903"
RemoteSysCpu = 0.0
TotalSuspensions = 0
WantCheckpoint = false
TransferExecutable = false
PeriodicRelease = false
CondorVersion = "$CondorVersion: 7.8.10 Jan 19 2015 BuildID: RH-7.8.10-0.2.el6 $"
Out = "/usr/local/mrgstorage/files/deployment_package/gpsdev/seqdata/ALL_GPS_Cases/gps-ccgs-blat-test-1_1436/7ed35721-3050-4bbb-b98d-cfe60a70b6ae/gps-ccgs-blat-test-1/logs/gene_names.out"
ShouldTransferFiles = "IF_NEEDED"
DiskUsage = 0
CumulativeSlotTime = 0
EC2SecurityGroups = "sg-433ce527"
CommittedSlotTime = 0
LocalUserCpu = 0.0
NotifyUser = "sridhar.thumma@xxxxxxxxxxxxxxxxxxxxxx"
DiskUsage_RAW = 0
ExitBySignal = false
StreamErr = false
HoldReasonCode = 0
NumSystemHolds = 1
NumRestarts = 0
RequestDisk = DiskUsage
GridJobId = "ec2 http://ec2.us-east-1.amazonaws.com 7d7576f3-b90e-4e00-a535-90f4d91f6ad0"
FileSystemDomain = "iad-dev-htc-a1.pdx.aws"
JobPrio = 5
EC2UserDataFile = "/usr/local/mrgstorage/files/deployment_package/gpsdev/seqdata/ALL_GPS_Cases/gps-ccgs-blat-test-1_1436/7ed35721-3050-4bbb-b98d-cfe60a70b6ae/logs/gene_names_job_primary_userdata.sh"
NumCkpts = 0
BufferBlockSize = 32768
ImageSize = 0
CommittedSuspensionTime = 0
ExecutableSize_RAW = 0
Cmd = "IAD-DEV-GET_GENE_NAMES_SCRRIPT"
WantClaiming = false
LocalSysCpu = 0.0
Iwd = "/mnt/gfs/files/deployment_package/gpsdev/seqdata/ALL_GPS_Cases/gps-ccgs-blat-test-1_1436/7ed35721-3050-4bbb-b98d-cfe60a70b6ae/logs"
GridResource = "ec2 http://ec2.us-east-1.amazonaws.com"
ServerTime = 1428419214
EC2InstanceType = "m3.medium"
ImageSize_RAW = 0
LastSuspensionTime = 0
JobStatus = 5
ExecutableSize = 0
RemoteWallClockTime = 0.0
OnExitRemove = true
Arguments = ""
KillSig = "SIGTERM"
StreamOut = false
CurrentTime = time()
RequestMemory = ifthenelse(MemoryUsage =!= undefined,MemoryUsage,( ImageSize + 1023 ) / 1024)
RemoteUserCpu = 0.0
NiceUser = false
RequestCpus = 1
EC2TagNames = "Name"
WantRemoteIO = true
LastJobStatus = 1