[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[HTCondor-users] Condor EC2 grid jobs availability zone issue



Hi,

I am submitting 100 grid jobs at once. Due to request rate limits in the EC2 API, I am getting a RequestLimitExceeded error. I am taking care of this by scheduling the jobs in batches.

I am also getting the error below, for which I have not been able to think of a solution.

"We currently do not have sufficient m3.2xlarge capacity in the Availability Zone you requested (us-east-1b). Our system will be working on provisioning additional capacity. You can currently get m3.2xlarge capacity by not specifying an Availability Zone in your request or choosing us-east-1c, us-east-1e, us-east-1a."

While investigating this, I found an article which says, "Don't specify an Availability Zone in your request unless necessary." So, I updated my Condor files to use the generic URL instead of the region-specific one.

I changed grid_resource from
grid_resource = ec2 http://ec2.us-east-1.amazonaws.com
to
grid_resource = ec2 http://ec2.amazonaws.com

However, it is still trying to submit to the us-east-1 region only. I can see that in the AWS console.

Any suggestions on this issue? Am I doing anything wrong here?

Attached classads.
MaxHosts = 1
Managed = "External"
User = "cgw_test"
OnExitHold = false
CoreSize = 0
LastRemoteStatusUpdate = 1428647871
WantRemoteSyscalls = false
MyType = "Job"
Rank = 500 - TotalLoadAvg
CumulativeSuspensionTime = 0
MinHosts = 1
PeriodicHold = false
PeriodicRemove = false
Err = "userlog.err"
Submission = "cgw_test"
ProcId = 0
EnteredCurrentStatus = 1428647871
UserLog = "userlog.log"
NumJobStarts = 1
JobUniverse = 9
In = "/dev/null"
GridJobStatus = "running"
Requirements = true
EC2VpcSubnet = "subnet-503cb427"
ClusterId = 6165
WhenToTransferOutput = "ON_EXIT"
CompletionDate = 0
EC2AmiID = "ami-f8ccf890"
BufferSize = 524288
Environment = "LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=01;05;37;41:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arj=01;31:*.taz=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.zip=01;31:*.z=01;31:*.Z=01;31:*.dz=01;31:*.gz=01;31:*.lz=01;31:*.xz=01;31:*.bz2=01;31:*.tbz=01;31:*.tbz2=01;31:*.bz=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.rar=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.jpg=01;35:*.jpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.axv=01;35:*.anx=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=01;36:*.au=01;36:*.flac=01;36:*.mid=01;36:*.midi=01;36:*.mka=01;36:*.mp3=01;36:*.mpc=01;36:*.ogg=01;36:*.ra=01;36:*.wav=01;36:*.axa=01;36:*.oga=01;36:*.spx=01;36:*.xspf=01;36: _=/usr/bin/condor_submit QTINC=/usr/lib64/qt-3.3/include CVS_RSH=ssh QTLIB=/usr/lib64/qt-3.3/lib HISTCONTROL=ignoredups PWD=/usr/local/mrgstorage/ce453370-b15b-4457-8695-4561b757ec2d/logs SHLVL=1 LANG=en_US.UTF-8 TERM=cygwin MAIL=/var/spool/mail/cgw_test LESSOPEN=||/usr/bin/lesspipe.sh' '%s OLDPWD=/usr/local/mrgstorage/ce453370-b15b-4457-8695-4561b757ec2d SSH_ASKPASS=/usr/libexec/openssh/gnome-ssh-askpass G_BROKEN_FILENAMES=1 QTDIR=/usr/lib64/qt-3.3 SHELL=/bin/bash USER=cgw_test PATH=/usr/lib64/qt-3.3/bin:/usr/local/bin:/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/sbin:/home/cgw_test/bin HISTSIZE=1000 LOGNAME=cgw_test HOSTNAME=iad-tst-htc-a1.pdx.aws HOME=/home/cgw_test"
EC2TagName = "IAD-TEST-ZONE"
TargetType = "Machine"
LeaveJobInQueue = false
JobNotification = 2
Owner = "cgw_test"
CondorPlatform = "$CondorPlatform: X86_64-ScientificLinux_6.6 $"
CommittedTime = 0
QDate = 1428647827
TransferIn = false
ExitStatus = 0
EC2SecretAccessKey = "SecretAccessKey"
NumCkpts_RAW = 0
RootDir = "/"
CurrentHosts = 0
GlobalJobId = "cgw_test@xxxxxxxxxx#6165.0#1428647827"
RemoteSysCpu = 0.0
TotalSuspensions = 0
WantCheckpoint = false
TransferExecutable = false
PeriodicRelease = false
CondorVersion = "$CondorVersion: 7.8.10 Jan 19 2015 BuildID: RH-7.8.10-0.2.el6 $"
Out = "userlog.out"
ShouldTransferFiles = "IF_NEEDED"
DiskUsage = 0
CumulativeSlotTime = 0
EC2SecurityGroups = "sg-433ce527"
CommittedSlotTime = 0
LocalUserCpu = 0.0
DiskUsage_RAW = 0
ExitBySignal = false
StreamErr = false
NumSystemHolds = 0
NumRestarts = 0
RequestDisk = DiskUsage
GridJobId = "ec2 http://ec2.amazonaws.com 0ab1db76-ec70-4217-9c14-54a9cc86c9e8 i-c2d2723f"
FileSystemDomain = "iad-tst-htc-a1.pdx.aws"
JobPrio = 5
EC2UserDataFile = "job_userdata.sh"
NumCkpts = 0
BufferBlockSize = 32768
EC2RemoteVirtualMachineName = "NULL"
ImageSize = 0
CommittedSuspensionTime = 0
ExecutableSize_RAW = 0
Cmd = "IAD-TEST-ZONE"
WantClaiming = false
LocalSysCpu = 0.0
Iwd = "/mnt/gfs/files/deployment_package/ce453370-b15b-4457-8695-4561b757ec2d/logs"
GridResource = "ec2 http://ec2.amazonaws.com"
ServerTime = 1428647879
EC2InstanceType = "m3.medium"
ImageSize_RAW = 0
LastSuspensionTime = 0
JobStatus = 2
ExecutableSize = 0
ShadowBday = 1428647871
RemoteWallClockTime = 0.0
OnExitRemove = true
EC2InstanceName = "i-c2d2723f"
Arguments = ""
KillSig = "SIGTERM"
EC2AccessKeyId = "AccessKeyID"
StreamOut = false
CurrentTime = time()
RequestMemory = ifthenelse(MemoryUsage =!= undefined,MemoryUsage,( ImageSize + 1023 ) / 1024)
RemoteUserCpu = 0.0
NiceUser = false
RequestCpus = 1
EC2TagNames = "Name"
WantRemoteIO = true
LastJobStatus = 1