
[HTCondor-users] not starting jobs in condor ver 8.3.6



(2nd try)
Hi,
I have been using condor to launch jobs on a small cluster of Scientific Linux virtual machines under OpenStack for over a year.
All worked well until I upgraded from condor 8.3.5 to 8.3.6.
With the new condor version the jobs stay in the queue for hours; sometimes they start, sometimes not.
Could you help me diagnose the problem and modify my condor_config.local for condor 8.3.6 so the jobs start?
The gory details of the whole setup are below.
Jan
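Since the jobs simply sit idle, a first diagnostic step on the 8.3.6 head node could be condor_q's built-in analyzer. A minimal sketch (job id 1.0 is taken from the queue listing further below; pick any idle cluster.proc from your own queue; the guard makes it a no-op on machines without HTCondor):

```shell
# Ask the schedd why an idle job is not being matched to a slot.
# 1.0 is the first idle job from the condor_q output below.
if command -v condor_q >/dev/null 2>&1; then
    condor_q -better-analyze 1.0
fi
```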

A) ======== 
The OpenStack controller has ports 9600-10600 open for condor, both TCP and UDP.

B)======= three 8-core VMs are launched
* IPs 118 (head) + 120 (worker) run condor 8.3.5 and all works
* IP 121 (head) runs condor 8.3.6 - jobs do not start <== THIS is my problem

Each VM is 8-core, 16 GB RAM, 50 GB disk. On the head node I opened 6 condor job slots; on the worker, 8 job slots are opened.

C)========  GOOD: the working IP=118 condor head has all 6+8=14 slots busy, jobs start within seconds ---
[cosy11@oswrk118 ~]$ uname -a
Linux oswrk118.lns.mit.edu 2.6.32-504.23.4.el6.x86_64 #1 SMP Tue Jun 9 11:55:03 CDT 2015 x86_64 x86_64 x86_64 GNU/Linux

[cosy11@oswrk118 ~]$ condor_version
$CondorVersion: 8.3.5 Apr 16 2015 BuildID: 315103 $
$CondorPlatform: X86_64-RedHat_6.6 $

[cosy11@oswrk118 nice-simple]$ condor_status
Name               OpSys      Arch   State     Activity LoadAv Mem   ActvtyTime

slot1@xxxxxxxxxxxx LINUX      X86_64 Claimed   Idle      1.160 2421  0+00:00:04
slot2@xxxxxxxxxxxx LINUX      X86_64 Claimed   Idle      0.480 2421  0+00:00:05
slot3@xxxxxxxxxxxx LINUX      X86_64 Claimed   Idle      1.110 2421  0+00:00:06
slot4@xxxxxxxxxxxx LINUX      X86_64 Claimed   Idle      0.050 2421  0+00:00:07
slot5@xxxxxxxxxxxx LINUX      X86_64 Claimed   Busy      1.000 2421  0+00:00:03
slot6@xxxxxxxxxxxx LINUX      X86_64 Claimed   Idle      1.000 2421  0+00:00:09
slot1@xxxxxxxxxxxx LINUX      X86_64 Claimed   Busy      0.130 1815  0+00:00:04
slot2@xxxxxxxxxxxx LINUX      X86_64 Claimed   Busy      0.160 1815  0+00:00:02
slot3@xxxxxxxxxxxx LINUX      X86_64 Claimed   Busy      0.160 1815  0+00:00:04
slot4@xxxxxxxxxxxx LINUX      X86_64 Claimed   Busy      0.080 1815  0+00:00:05
slot5@xxxxxxxxxxxx LINUX      X86_64 Claimed   Busy      0.160 1815  0+00:00:05
slot6@xxxxxxxxxxxx LINUX      X86_64 Claimed   Busy      0.030 1815  0+00:00:09
slot7@xxxxxxxxxxxx LINUX      X86_64 Claimed   Busy      0.080 1815  0+00:00:08
slot8@xxxxxxxxxxxx LINUX      X86_64 Claimed   Busy      0.020 1815  0+00:00:03
                     Machines Owner Claimed Unclaimed Matched Preempting

        X86_64/LINUX       14     0      14         0       0          0

               Total       14     0      14         0       0          0


D)========  NOT working: IP=121, just the head node with 6 job slots
[cosy11@oswrk121 nice-simple]$ uname -a
Linux oswrk121.lns.mit.edu 2.6.32-504.23.4.el6.x86_64 #1 SMP Tue Jun 9 11:55:03 CDT 2015 x86_64 x86_64 x86_64 GNU/Linux

[cosy11@oswrk121 nice-simple]$ condor_version
$CondorVersion: 8.3.6 Jun 21 2015 BuildID: 325064 $
$CondorPlatform: X86_64-RedHat_6.6 $


[cosy11@oswrk121 nice-simple]$ condor_q


-- Submitter: oswrk121.lns.mit.edu : <198.125.163.121:9601?addrs=10.200.60.19-9601&noUDP> : oswrk121.lns.mit.edu
 ID      OWNER            SUBMITTED     RUN_TIME ST PRI SIZE CMD               
   1.0   cosy11          7/2  10:45   0+00:00:00 I  0   0.0  oneA_job.sh A222,o
   1.1   cosy11          7/2  10:45   0+00:00:00 I  0   0.0  oneA_job.sh A222,o
   1.2   cosy11          7/2  10:45   0+00:00:00 I  0   0.0  oneA_job.sh A222,o
   1.3   cosy11          7/2  10:45   0+00:00:00 I  0   0.0  oneA_job.sh A222,o
   1.4   cosy11          7/2  10:45   0+00:00:00 I  0   0.0  oneA_job.sh A222,o
   1.5   cosy11          7/2  10:45   0+00:00:00 I  0   0.0  oneA_job.sh A222,o
   1.6   cosy11          7/2  10:45   0+00:00:00 I  0   0.0  oneA_job.sh A222,o
   1.7   cosy11          7/2  10:45   0+00:00:00 I  0   0.0  oneA_job.sh A222,o
   1.8   cosy11          7/2  10:45   0+00:00:00 I  0   0.0  oneA_job.sh A222,o
   1.9   cosy11          7/2  10:45   0+00:00:00 I  0   0.0  oneA_job.sh A222,o
   1.10  cosy11          7/2  10:45   0+00:00:00 I  0   0.0  oneA_job.sh A222,o
   1.11  cosy11          7/2  10:45   0+00:00:00 I  0   0.0  oneA_job.sh A222,o

12 jobs; 0 completed, 0 removed, 12 idle, 0 running, 0 held, 0 suspended
[cosy11@oswrk121 nice-simple]$ condor_status
Name               OpSys      Arch   State     Activity LoadAv Mem   ActvtyTime

slot1@xxxxxxxxxxxx LINUX      X86_64 Unclaimed Idle      0.020 2421  0+00:16:08
slot2@xxxxxxxxxxxx LINUX      X86_64 Unclaimed Idle      0.000 2421  0+00:16:35
slot3@xxxxxxxxxxxx LINUX      X86_64 Unclaimed Idle      0.000 2421  0+00:16:36
slot4@xxxxxxxxxxxx LINUX      X86_64 Unclaimed Idle      0.000 2421  0+00:16:37
slot5@xxxxxxxxxxxx LINUX      X86_64 Unclaimed Idle      0.000 2421  0+00:16:38
slot6@xxxxxxxxxxxx LINUX      X86_64 Unclaimed Idle      0.000 2421  0+00:16:39
                     Machines Owner Claimed Unclaimed Matched Preempting

        X86_64/LINUX        6     0       0         6       0          0

               Total        6     0       0         6       0          0
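When the slots stay Unclaimed like this, the schedd and negotiator logs usually say why no match was made. A hedged sketch, assuming the stock RPM log layout (LOG = /var/log/condor); the paths may differ on your install:

```shell
# Show the tail of the matchmaking-related daemon logs.
# Paths assume the default RPM layout; adjust LOG if condor_config_val LOG differs.
LOG=/var/log/condor
for f in SchedLog NegotiatorLog MatchLog; do
    if [ -r "$LOG/$f" ]; then
        echo "==== $f ===="
        tail -n 20 "$LOG/$f"
    fi
done
```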

E)==========  Here is my current condor_config.local for the non-working IP=121. This condor config is script-generated and
identical to the one on the working IP=118, except that some IP values are adjusted accordingly.
Both the working and non-working condor masters run under the same OpenStack controller and have the same ports opened via iptables.

Public IP:  198.125.163.121

Private IP (eth0):  10.200.60.19

[root@oswrk121 condor]# service iptables status
Table: filter
Chain INPUT (policy ACCEPT)
num  target     prot opt source               destination         
1    ACCEPT     udp  --  10.200.60.0/24       0.0.0.0/0           state NEW,ESTABLISHED udp dpts:9600:10600 
2    ACCEPT     tcp  --  10.200.60.0/24       0.0.0.0/0           state NEW,ESTABLISHED tcp dpts:9600:10600 
3    ACCEPT     tcp  --  198.125.163.0/24     0.0.0.0/0           state NEW,ESTABLISHED tcp dpts:9600:10600 
4    ACCEPT     udp  --  198.125.163.0/24     0.0.0.0/0           state NEW,ESTABLISHED udp dpts:9600:10600 
5    ACCEPT     all  --  0.0.0.0/0            0.0.0.0/0           state RELATED,ESTABLISHED 
6    ACCEPT     icmp --  0.0.0.0/0            0.0.0.0/0           
7    ACCEPT     all  --  0.0.0.0/0            0.0.0.0/0           
8    ACCEPT     tcp  --  0.0.0.0/0            0.0.0.0/0           state NEW tcp dpt:22 
9    REJECT     all  --  0.0.0.0/0            0.0.0.0/0           reject-with icmp-host-prohibited 

Chain FORWARD (policy ACCEPT)
num  target     prot opt source               destination         
1    REJECT     all  --  0.0.0.0/0            0.0.0.0/0           reject-with icmp-host-prohibited 

Chain OUTPUT (policy ACCEPT)
num  target     prot opt source               destination         
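Even with these rules in place, it may be worth probing whether the advertised address is actually reachable. A sketch (the IP and port 9601 come from the Submitter line in the condor_q output above; the condor_ping invocation is an assumption about the setup, and both probes are guarded so the snippet is harmless where the tools are missing):

```shell
# Probe the non-working head node's collector address from another machine.
# <198.125.163.121:9601> is the sinful address shown by condor_q above.
HEAD=198.125.163.121
if command -v condor_ping >/dev/null 2>&1; then
    condor_ping -verbose -addr "<$HEAD:9601>" WRITE   # end-to-end WRITE authorization check
fi
if command -v nc >/dev/null 2>&1; then
    if nc -z -w 3 "$HEAD" 9601; then echo "tcp 9601 reachable"; fi
fi
```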

[root@oswrk121 condor]# ifconfig 
eth0      Link encap:Ethernet  HWaddr FA:16:3E:62:6E:1C  
          inet addr:10.200.60.19  Bcast:10.200.60.255  Mask:255.255.255.0
          inet6 addr: fe80::f816:3eff:fe62:6e1c/64 Scope:Link
          UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
          RX packets:6900 errors:0 dropped:0 overruns:0 frame:0
          TX packets:12353 errors:0 dropped:0 overruns:0 carrier:0
          collisions:0 txqueuelen:1000 
          RX bytes:1899504 (1.8 MiB)  TX bytes:3054215 (2.9 MiB)

lo        Link encap:Local Loopback  
          inet addr:127.0.0.1  Mask:255.0.0.0
          inet6 addr: ::1/128 Scope:Host
          UP LOOPBACK RUNNING  MTU:65536  Metric:1
          RX packets:42410 errors:0 dropped:0 overruns:0 frame:0
          TX packets:42410 errors:0 dropped:0 overruns:0 carrier:0
          collisions:0 txqueuelen:0 
          RX bytes:1260991166 (1.1 GiB)  TX bytes:1260991166 (1.1 GiB)


[root@oswrk121 ]# cat /etc/condor/condor_config.local
# modified by Jan Balewski, MIT
CONDOR_HOST = $(FULL_HOSTNAME)
COLLECTOR_NAME = VM condor master on $(FULL_HOSTNAME)
###############################################################################
# Pool settings
###############################################################################
# EC2 workers don't have shared filesystems or authentication
UID_DOMAIN = lns.mit.edu
FILESYSTEM_DOMAIN = $(FULL_HOSTNAME)
USE_NFS = False
USE_AFS = False
USE_CKPT_SERVER = False
# The same for all machines with the same condor user
CONDOR_IDS = 496.492
###############################################################################
#  trick to force condor to use public IP
###############################################################################
# to check what IP condor uses execute:
#    condor_status -format "%s, " Name -format "%s\n" MyAddress
# to check what public IP VM uses execute:
# see more details in this post:
# below, use the public IP of this node
TCP_FORWARDING_HOST = 198.125.163.121
###############################################################################
# Security settings
###############################################################################
# Allow local host and the central manager to manage the node
ALLOW_ADMINISTRATOR = $(FULL_HOSTNAME), $(CONDOR_HOST)
# the master needs both of these address forms (hostname and private subnet)
ALLOW_READ = *.lns.mit.edu,10.200.60.*
ALLOW_WRITE = *.lns.mit.edu,10.200.60.*
###############################################################################
# CPU usage settings
###############################################################################
# Don't count a hyperthreaded CPU as multiple CPUs
COUNT_HYPERTHREAD_CPUS = False
# $(DETECTED_CORES) is the number of detected CPU cores, including
# hyperthreaded cores if there are any; by default condor advertises
# one slot per CPU.
# On the master, reduce the number of job slots to N-2
NUM_CPUS = $(DETECTED_CORES)-2
###############################################################################
# Daemon settings
###############################################################################
# Full list on the host node
DAEMON_LIST = COLLECTOR, MASTER, NEGOTIATOR, SCHEDD, STARTD
# Don't run java
JAVA = 
###############################################################################
# Classads
###############################################################################
# Run everything, all the time
START = True
SUSPEND = False
CONTINUE = True
PREEMPT = False
WANT_VACATE = False
WANT_SUSPEND = True
SUSPEND_VANILLA = False
WANT_SUSPEND_VANILLA = True
KILL = False
STARTD_EXPRS = START
###############################################################################
# Network settings
###############################################################################
# Use random numbers here so the workers don't all hit the collector at 
# the same time. If there are many workers the collector can get overwhelmed.
UPDATE_INTERVAL = $RANDOM_INTEGER(230, 370)
MASTER_UPDATE_INTERVAL = $RANDOM_INTEGER(230, 370)
# Port range for Jan's VM-condor cluster at LNS 
LOWPORT=9600
HIGHPORT=10600
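For completeness, after any edit to condor_config.local I double-check what the daemons actually parse. A small sketch (guarded so it does nothing where HTCondor is not installed):

```shell
# Print the effective values of the settings most likely to matter here;
# condor_config_val accepts several macro names in one call.
if command -v condor_config_val >/dev/null 2>&1; then
    condor_config_val NUM_CPUS TCP_FORWARDING_HOST LOWPORT HIGHPORT DAEMON_LIST
fi
```

followed by condor_restart so the running daemons pick up the change.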