
[HTCondor-users] Need help in setting up condor parallel universe



Hi All,

I have 3 servers and I am trying to set up Condor-MPI on our cluster,
but I am unable to run jobs using the parallel universe in Condor. I
have followed all the steps given on the HTCondor tutorial page.
I need help to resolve this issue.

HTCondor version: 8.4.4
OS: Scientific Linux 7.2
Open MPI version: 1.6.5

I have enclosed my files in .txt format for your reference.
These are the problems I am facing:

 1) The contact file is not getting created automatically.
 2) If I specify requirements and request_cpus arguments in the condor
    submit file, the job goes into an idle state.
 3) If I don't give requirements and request_cpus arguments in the condor
    submit file, the contact file is not created and the output is
    empty (the job keeps running).

I am new to Condor; how can I make my cluster work with Condor-MPI?
If you could describe the setup and give me example scripts, that would
be a great help.

Since this is very important to us, any help would be much appreciated.

Regards,
Malathi.
##--------------------------------------------------------------------
##  File 1: condor_config.local on the central manager / dedicated
##  scheduler host (presumably gpu001; it runs COLLECTOR and NEGOTIATOR)
##--------------------------------------------------------------------

DAEMON_LIST = MASTER, STARTD, SCHEDD, COLLECTOR, NEGOTIATOR

# The following section is required for MPI / parallel-universe jobs to work.

OPENMPI_INSTALL_PATH = /soft/condor_mpi/
OPENMPI_EXCLUDE_NETWORK_INTERFACES = docker0,virbr0
DedicatedScheduler = "DedicatedScheduler@gpu001"

SUSPEND         = Scheduler =!= $(DedicatedScheduler) && ($(SUSPEND))
PREEMPT         = Scheduler =!= $(DedicatedScheduler) && ($(PREEMPT))
START           = (Scheduler =?= $(DedicatedScheduler)) || ($(START))
RANK_FACTOR     = 1000000
RANK            = (Scheduler =?= $(DedicatedScheduler)) * $(RANK_FACTOR) + $(RANK)

MPI_CONDOR_RSH_PATH = $(LIBEXEC)
CONDOR_SSHD = /usr/sbin/sshd
CONDOR_SSH_KEYGEN = /usr/bin/ssh-keygen
STARTD_ATTRS = $(STARTD_ATTRS), DedicatedScheduler
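# Quick sanity check (not part of the configuration): once the startds have
# been reconfigured, every dedicated node should advertise this attribute.
# For example:
#   condor_status -const 'DedicatedScheduler =!= undefined' \
#                 -af Name DedicatedScheduler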

# NOTE: HTCondor reads its configuration top to bottom and the last
# definition of a macro wins, so the unconditional policy below replaces
# the dedicated-scheduler SUSPEND/PREEMPT/START expressions defined above.
# That is fine for a node that only runs dedicated jobs; on a mixed node,
# these defaults should be set *before* the dedicated section instead.
WANT_SUSPEND    = False
WANT_VACATE     = False
START           = True
SUSPEND         = False
CONTINUE        = True
PREEMPT         = False
KILL            = False



##--------------------------------------------------------------------
##  File 2: condor_config.local on the execute nodes
##--------------------------------------------------------------------

DAEMON_LIST = MASTER, STARTD, SCHEDD

# The following section is required for MPI / parallel-universe jobs to work.

# This is for the dedicated scheduler.

DedicatedScheduler = "DedicatedScheduler@gpu001"

START           = True
SUSPEND = False
CONTINUE        = True
PREEMPT = False
KILL            = False
WANT_SUSPEND    = False
WANT_VACATE     = False
RANK            = Scheduler =?= $(DedicatedScheduler)

MPI_CONDOR_RSH_PATH = /soft/condor_mpi/bin/condor_ssh
CONDOR_SSHD = /usr/sbin/sshd
CONDOR_SSH_KEYGEN = /usr/bin/ssh-keygen
STARTD_ATTRS = $(STARTD_ATTRS), DedicatedScheduler
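# After editing either local config file, the changes can be applied without
# restarting the daemons by running (from a host with administrator access,
# e.g. the central manager):
#   condor_reconfig -all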


##--------------------------------------------------------------------
##  File 3: the pool-wide condor_config (apparently
##  /etc/condor/condor_config), excerpt
##--------------------------------------------------------------------

##  Where have you installed the bin, sbin and lib condor directories?
RELEASE_DIR = /usr
##  Where is the local condor directory for each host?  This is where the local config file(s), logs and
##  spool/execute directories are located. this is the default for Linux and Unix systems.
LOCAL_DIR = /var
##  Where is the machine-specific local config file for each host?
LOCAL_CONFIG_FILE = /etc/condor/condor_config.local
##  If the local config file is not present, is it an error? (WARNING: This is a potential security issue.)
REQUIRE_LOCAL_CONFIG_FILE = false
##  Use a host-based security policy. By default CONDOR_HOST and the local machine will be allowed
use SECURITY : HOST_BASED
##  To expand your condor pool beyond a single host, set ALLOW_WRITE to match all of the hosts

## This was added by Malathi for testing CONDOR_MPI

CONDOR_HOST = 10.2.9.101
#CONDOR_HOST = 10.2.9.106
NETWORK_INTERFACE       = 10.2.9.*
ALLOW_READ              = 10.1.9.*, 10.2.9.*
ALLOW_WRITE             = 10.1.9.*, 10.2.9.*
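# Sanity check (not part of the configuration): once all three servers are
# configured, `condor_status -master` run against this central manager should
# list every host; a machine missing from that output usually indicates an
# ALLOW_WRITE or firewall problem.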

# The following setting runs the HTCondor daemons as root (UID 0, GID 0).
CONDOR_IDS=0.0
START = TRUE
SUSPEND = FALSE
USE_NFS=FALSE
CREATE_LOCKS_ON_LOCAL_DISK= TRUE
ENABLE_USERLOG_LOCKING= TRUE
IGNORE_NFS_LOCK_ERRORS = True
DAGMAN_LOG_ON_NFS_IS_ERROR = FALSE
DAGMAN_USE_STRICT=0
DAGMAN_ABORT_DUPLICATES = True
PREEMPT = FALSE
KILL = FALSE

##--------------------------------------------------------------------
##  Pathnames
RUN     = $(LOCAL_DIR)/run/condor
LOG     = $(LOCAL_DIR)/log/condor
LOCK    = $(LOCAL_DIR)/lock/condor
SPOOL   = $(LOCAL_DIR)/lib/condor/spool
EXECUTE = $(LOCAL_DIR)/lib/condor/execute


BIN     = $(RELEASE_DIR)/bin
LIB     = $(RELEASE_DIR)/lib64/condor
INCLUDE = $(RELEASE_DIR)/include/condor
SBIN    = $(RELEASE_DIR)/sbin
LIBEXEC = $(RELEASE_DIR)/libexec/condor
SHARE   = $(RELEASE_DIR)/share/condor

PROCD_ADDRESS = $(RUN)/procd_pipe

JAVA_CLASSPATH_DEFAULT = $(SHARE) $(SHARE)/scimark2lib.jar .

# The following must match on all the machines!

UID_DOMAIN              = IUCAA
TRUST_UID_DOMAIN        = True

##  Allow admins ONLY on LAN
ALLOW_ADMINISTRATOR     = $(CONDOR_HOST)
ALLOW_OWNER             = $(FULL_HOSTNAME), $(ALLOW_ADMINISTRATOR)

##  This macro determines what daemons the condor_master will start and keep its watchful eyes on.
##  The list is a comma or space separated list of subsystem names

STANDARD        = 1
VANILLA         = 5
MPI             = 8
VM              = 13
IsMPI           = (TARGET.JobUniverse == $(MPI))
IsVanilla       = (TARGET.JobUniverse == $(VANILLA))
IsStandard      = (TARGET.JobUniverse == $(STANDARD))
IsVM            = (TARGET.JobUniverse == $(VM))
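# Note: these JobUniverse numbers come from the stock configuration. The
# parallel universe used by the submit file below is JobUniverse 11, so if
# these macros are referenced in policy expressions, a matching macro may be
# wanted as well:
#   PARALLEL   = 11
#   IsParallel = (TARGET.JobUniverse == $(PARALLEL))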


COLLECTOR_NAME = $(CONDOR_HOST)
ALLOW_READ = *
ALLOW_WRITE = $(ALLOW_WRITE), $(CONDOR_HOST) 
ALLOW_NEGOTIATOR = $(CONDOR_HOST), $(IP_ADDRESS)



##--------------------------------------------------------------------
##  File 4: condor_ssh (the ssh wrapper transferred with the job)
##--------------------------------------------------------------------

#!/bin/bash

##**************************************************************
##
## Copyright (C) 1990-2017, Condor Team, Computer Sciences Department,
## University of Wisconsin-Madison, WI.
## 
## Licensed under the Apache License, Version 2.0 (the "License"); you
## may not use this file except in compliance with the License.  You may
## obtain a copy of the License at
## 
##    http://www.apache.org/licenses/LICENSE-2.0
## 
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
##
##**************************************************************

# condor_ssh wrapper
# The format of each contact file line is
#   node hostname port username cwd runid
# (the order in which sshd.sh writes the fields, and the order in which
# they are parsed below).
# This script assumes the existence of a contact file
# and uses it to map a hostname into
# the correct hostname/port of a listening sshd.
# Uncomment the following line to have the shell print
# each command to stderr before running it, useful for
# debugging:
#set -x


if [ $# -lt 2 ]
then
    echo "Usage: condor_ssh hostname command arg1 arg2 ..."
    exit 1
fi

doneParsing=false
while [ $doneParsing = "false" ]
do
doneParsing=true

if [ "$1" = "-x" ]
then
    shift
    hasx="-x"
    doneParsing="false"
fi

if [ "$1" = "-l" ]
then
    shift
    shift
    doneParsing="false"
fi

if [ "$1" = "-n" ]
then
    shift
    hasn="-n"
    doneParsing="false"
fi
done

proc=$1
shift 

# The options can also appear _after_ the host
doneParsing=false
while [ $doneParsing = "false" ]
do
doneParsing=true

if [ "$1" = "-x" ]
then
    shift
    hasx="-x"
    doneParsing="false"
fi

if [ "$1" = "-l" ]
then
    shift
    shift
    doneParsing="false"
fi

if [ "$1" = "-n" ]
then
    shift
    hasn="-n"
    doneParsing="false"
fi
done

# The HTCondor environment variables aren't always passed,
# but this script should always execute from the scratch dir
if [ -z ${_CONDOR_SCRATCH_DIR+x} ]
then
    _CONDOR_SCRATCH_DIR=`/bin/pwd`
fi

contact=$_CONDOR_SCRATCH_DIR/contact

if [ ! -f $contact ]
then
    echo "error: contact file $contact can't be found"
    exit 1
fi

# Note that the spaces in the grep are significant
line=`grep "^$proc " $contact`

if [ $? -ne 0 ]
then
    echo Proc $proc is not in contact file $contact
    exit 1
fi



#proc=`echo $line | awk '{print $1}'`

host=`echo $line | awk '{print $2}'`
port=`echo $line | awk '{print $3}'`
username=`echo $line | awk '{print $4}'`
dir=`echo $line | awk '{print $5}'`
key=$_CONDOR_SCRATCH_DIR/tmp/$proc.key
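# Example (hypothetical values): given the contact line
#   0 gpu001 4444 malathi /var/lib/condor/execute/dir_12345 1490000000
# proc 0 maps to the sshd listening on gpu001:4444 for user malathi, whose
# remote working directory is /var/lib/condor/execute/dir_12345.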


# Open MPI/MPICH assumes that you always have a shared filesystem, and
# sticks the local pwd in front of all relative executable pathnames.
# This is no good here, so if any argument contains the local pwd,
# replace it with the remote node's working directory from the contact file.

ssh_args=$@
p=`/bin/pwd`
ssh_args=`echo $ssh_args | sed s@${p}@${dir}@g`
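# Example (hypothetical paths): if the local pwd is
# /var/lib/condor/execute/dir_100 and the remote dir from the contact file is
# /var/lib/condor/execute/dir_200, an argument such as
#   /var/lib/condor/execute/dir_100/test_prog
# is rewritten to
#   /var/lib/condor/execute/dir_200/test_prog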

# Now call ssh with the remaining arguments at the end.
# Set the working directory and $HOME to the remote scratch dir
# so that the worker processes look there for the user's executable.

/usr/bin/ssh -q $hasn -oStrictHostKeyChecking=no -oUserKnownHostsFile=/dev/null -i $key -l $username -p $port $host cd "$dir" \; export HOME="$dir" \; "$ssh_args"

##--------------------------------------------------------------------
##  File 5: the submit description file for the Open MPI job
##--------------------------------------------------------------------

## This is for the openmpiscript to work.
# Without a universe line the job defaults to vanilla and no contact
# file is ever created; openmpiscript also supplies -n itself, so only
# one "arguments" line is kept (a later one overrides an earlier one).
universe = parallel
JOBNAME = test_prog
executable = openmpiscript
arguments = test_prog
transfer_input_files = test_prog,condor_ssh,sshd.sh
# As in the sample submit file embedded in openmpiscript:
should_transfer_files = yes
when_to_transfer_output = on_exit_or_evict
machine_count = 6

output = $(JOBNAME).out
error  = $(JOBNAME).err
log    = $(JOBNAME).log

queue
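# Debugging tip (not part of the submit file): for problem (2) above, where
# the job stays idle, run
#   condor_q -better-analyze <cluster>.<proc>
# to see which clause of the job's Requirements fails to match any slot (for
# example, a request_cpus value larger than any single machine provides).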
##--------------------------------------------------------------------
##  File 6: openmpiscript (the parallel-universe wrapper executable)
##--------------------------------------------------------------------

#!/bin/bash
##**************************************************************
##
## Copyright (C) 1990-2010, Condor Team, Computer Sciences Department,
## University of Wisconsin-Madison, WI.
## 
## Licensed under the Apache License, Version 2.0 (the "License"); you
## may not use this file except in compliance with the License.  You may
## obtain a copy of the License at
## 
##    http://www.apache.org/licenses/LICENSE-2.0
## 
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
##
##**************************************************************

# YOU MUST CHANGE THIS TO THE PREFIX DIR OF OPENMPI
MPDIR=/soft/condor_mpi


# This check is redundant here, since both branches assign the same
# directory (presumably inherited from a version with a separate 64-bit
# prefix).
if `uname -m | grep "64" 1>/dev/null 2>&1`
then
    MPDIR=/soft/condor_mpi
fi

PATH=$MPDIR/bin/:.:$PATH
export PATH

#LD_LIBRARY_PATH=$MPDIR/lib/:.:$LD_LIBRARY_PATH
LD_LIBRARY_PATH=/soft/condor_mpi/lib:.:$LD_LIBRARY_PATH

export LD_LIBRARY_PATH


#PATH=$MPDIR/bin:$MPDIR/1.4-gcc/bin:.:$PATH
#export PATH

# This is a script to run Open MPI jobs under the Condor parallel universe.
# It assumes that a full Open MPI installation is present on all execute
# machines.
# A sample submit file might look like...
#

#universe = parallel
#executable = openmpiscript
#getenv=true
#arguments = actual_mpi_job arg1 arg2 arg3
#
#should_transfer_files = yes
#when_to_transfer_output = on_exit_or_evict
#
#output = o.$(NODE)
#error  = e.$(NODE)
#log    = l
#
#notification = never
#machine_count = 8
#queue

# These are set in the environment by the parallel-universe starter.
_CONDOR_PROCNO=$_CONDOR_PROCNO
_CONDOR_NPROCS=$_CONDOR_NPROCS

#CONDOR_SSH=`condor_config_val libexec`
#CONDOR_SSH=$CONDOR_SSH/condor_ssh
#CONDOR_SSH=/home/malathi.d/condor_program	
CONDOR_SSH=./condor_ssh
	
#SSHD_SH=`condor_config_val libexec`
#SSHD_SH=$SSHD_SH/sshd.sh


#SSHD_SH=/home/malathi.d/condor_program
SSHD_SH=./sshd.sh


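# sshd.sh is sourced (rather than executed) so that the sshd_cleanup
# function and the variables it defines stay available in this shell; see
# the comment at the end of sshd.sh.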
. $SSHD_SH $_CONDOR_PROCNO $_CONDOR_NPROCS 

# If this is not the head node, just wait on the background sshd forever,
# so the sshds keep running for the lifetime of the job
if [ $_CONDOR_PROCNO -ne 0 ]
then
		wait
		#sshd_cleanup 
		exit 0
fi

EXECUTABLE=$1
shift

# The binary is copied but the executable flag is cleared,
# so the script has to take care of this:
#chmod +x $EXECUTABLE

CONDOR_CONTACT_FILE=$_CONDOR_SCRATCH_DIR/contact
export CONDOR_CONTACT_FILE

# The second field in the contact file is the machine name
# that condor_ssh knows how to use
sort -n -k 1 < $CONDOR_CONTACT_FILE | awk '{print $2}' > machines
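# For example (hypothetical hosts), a 3-node run would leave a machines file
# with one hostname per line, in node order:
#   gpu001
#   gpu002
#   gpu003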

## run the actual mpijob
if `ompi_info --param all all | grep orte_rsh_agent 1>/dev/null 2>&1`
then
    mpirun -v --prefix $MPDIR --mca orte_rsh_agent $CONDOR_SSH -n $_CONDOR_NPROCS -hostfile machines $EXECUTABLE $@ 
else
    ########## For mpi versions 1.1 & 1.2 use the line below
    mpirun -v --mca plm_rsh_agent $CONDOR_SSH -n $_CONDOR_NPROCS -hostfile machines $EXECUTABLE $@ 
fi

#sshd_cleanup
#rm -f machines

exit $?

##--------------------------------------------------------------------
##  File 7: sshd.sh (starts a per-node sshd and reports the contact info)
##--------------------------------------------------------------------

##**************************************************************
##
## Copyright (C) 1990-2017, Condor Team, Computer Sciences Department,
## University of Wisconsin-Madison, WI.
## 
## Licensed under the Apache License, Version 2.0 (the "License"); you
## may not use this file except in compliance with the License.  You may
## obtain a copy of the License at
## 
##    http://www.apache.org/licenses/LICENSE-2.0
## 
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
##
##**************************************************************

sshd_cleanup() {

    rm -f ${hostkey}.dsa ${hostkey}.rsa ${hostkey}.dsa.pub ${hostkey}.rsa.pub ${idkey} ${idkey}.pub $_CONDOR_SCRATCH_DIR/tmp/sshd.out $_CONDOR_SCRATCH_DIR/contact
}

trap sshd_cleanup SIGTERM

# Note: sshd requires a full path.
SSHD=`condor_config_val CONDOR_SSHD`
KEYGEN=`condor_config_val CONDOR_SSH_KEYGEN`
CONDOR_CHIRP=`condor_config_val libexec`
CONDOR_CHIRP=$CONDOR_CHIRP/condor_chirp

if [ -z "$SSHD" -o -z "$KEYGEN" ]
then
    echo CONDOR_SSHD and/or CONDOR_SSH_KEYGEN are not configured, exiting
    exit 255
fi

PORT=4444
# _CONDOR_REMOTE_SPOOL_DIR is set in the environment; the proc number and
# node count are passed in as arguments by openmpiscript.
_CONDOR_REMOTE_SPOOL_DIR=$_CONDOR_REMOTE_SPOOL_DIR
_CONDOR_PROCNO=$1
_CONDOR_NPROCS=$2

# Make a tmp dir to store keys, etc., that
# won't get transferred back

mkdir $_CONDOR_SCRATCH_DIR/tmp

# Create the host keys

hostkey=$_CONDOR_SCRATCH_DIR/tmp/hostkey
for keytype in dsa rsa
do
    rm -f ${hostkey}.${keytype} ${hostkey}.${keytype}.pub
    $KEYGEN -q -f ${hostkey}.${keytype} -t $keytype -N '' 
    _TEST=$?

    if [ $_TEST -ne 0 ]
    then
        echo "ssh key generator $KEYGEN returned error $_TEST, exiting"
        exit 255
    fi
done

idkey=$_CONDOR_SCRATCH_DIR/tmp/$_CONDOR_PROCNO.key

# Create the identity key

$KEYGEN -q -f $idkey -t rsa -N '' 
_TEST=$?
if [ $_TEST -ne 0 ]
then
    echo "ssh key generator $KEYGEN returned error $_TEST, exiting"
    exit 255
fi

# Send the identity keys back home
$CONDOR_CHIRP put -perm 0700 $idkey $_CONDOR_REMOTE_SPOOL_DIR/$_CONDOR_PROCNO.key
_TEST=$?

if [ $_TEST -ne 0 ]
then
    echo error $_TEST chirp putting identity keys back
    exit 255
fi

# ssh needs full paths to all of its arguments
# Start up sshd
done=0

while [ $done -eq 0 ]
do

# Try to launch sshd on this port

    $SSHD -p$PORT -oAuthorizedKeysFile=${idkey}.pub -oHostKey=${hostkey}.dsa -oHostKey=${hostkey}.rsa -De -f/dev/null -oStrictModes=no -oPidFile=/dev/null -oAcceptEnv=_CONDOR < /dev/null > $_CONDOR_SCRATCH_DIR/tmp/sshd.out 2>&1 &
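# Flag notes: -D keeps sshd in the foreground, -e sends its log output to
# stderr (captured into sshd.out by the redirection above), and
# -f/dev/null skips the system-wide sshd_config.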

    pid=$!

# Give sshd some time

    sleep 2
    if grep "Server listening" $_CONDOR_SCRATCH_DIR/tmp/sshd.out > /dev/null 2>&1
    then
        done=1
    else

# it is probably dead now
#kill -9 $pid > /dev/null 2>&1

        PORT=`expr $PORT + 1`
    fi
done

# Don't need this anymore

rm $_CONDOR_SCRATCH_DIR/tmp/sshd.out

# create contact file

hostname=`hostname`
currentDir=`pwd`
user=`whoami`
thisrun=`$CONDOR_CHIRP get_job_attr EnteredCurrentStatus`
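# EnteredCurrentStatus changes each time the job (re)starts, so it serves
# as a run identifier: the head node below counts only the contact lines
# that carry this run's value.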
echo "$_CONDOR_PROCNO $hostname $PORT $user $currentDir $thisrun"  |
        $CONDOR_CHIRP put -mode cwa - $_CONDOR_REMOTE_SPOOL_DIR/contact 
_TEST=$?

if [ $_TEST -ne 0 ]
then
    echo error $_TEST chirp putting contact info back to submit machine
    exit 255

fi

# On the head node, gather the contact file and the keys
if [ $_CONDOR_PROCNO -eq 0 ]
then
    done=0
    count=0

# Need to poll the contact file until all nodes have reported in

    while [ $done -eq 0 ]
    do
        rm -f contact
        $CONDOR_CHIRP fetch $_CONDOR_REMOTE_SPOOL_DIR/contact $_CONDOR_SCRATCH_DIR/contact
        lines=`grep -c $thisrun $_CONDOR_SCRATCH_DIR/contact`
        if [ $lines -eq $_CONDOR_NPROCS ]
        then
            done=1
            node=0
            while [ $node -ne $_CONDOR_NPROCS ]
            do
                $CONDOR_CHIRP fetch $_CONDOR_REMOTE_SPOOL_DIR/$node.key $_CONDOR_SCRATCH_DIR/tmp/$node.key

# Now that we've got it, the submit side doesn't need it anymore

                $CONDOR_CHIRP remove $_CONDOR_REMOTE_SPOOL_DIR/$node.key 

                node=`expr $node + 1`
            done

            chmod 0700 $_CONDOR_SCRATCH_DIR/tmp/*.key

# Erase the contact file from the spool directory, in case
# this job is held and rescheduled  

            $CONDOR_CHIRP remove $_CONDOR_REMOTE_SPOOL_DIR/contact


        else

# Wait a second before polling again
            sleep 1
        fi

        # Timeout after polling 1200 times (about 20 minutes)

        count=`expr $count + 1`
        if [ $count -eq 1200 ]
        then
            exit 1
        fi
    done
fi



# This file is sourced by the MPI startup scripts, so they can wait and
# run sshd_cleanup over there as needed.
#wait
#sshd_cleanup