[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Condor-users] parallel MPI job runs from command line, but not in condor?



I have an MPI job that I can run from the command line using the following command:
 
/opt/scali/bin/mpirun -v -np 4 -machinefile condor_machines /usr/local/share/ecl/2008.1/bin/linux_x86_64/eclipse_scampi.exe PARALLEL4
 
This works with no problems. However, when using the following submit file, the job aborts right away. I have listed the submit file and the output from using "set -x" in
the MPI wrapper shell script (mpiscript).
 
Does anyone have any ideas?
 
Thanks
Jeff
 
 
universe = parallel
machine_count = 4
executable = mpiscript
case_name = PARALLEL4
run_number = 46
deck_name = $(case_name)
simulator_executable = /usr/local/share/ecl/2008.1/bin/linux_x86_64/eclipse_scampi.exe
environment = "LD_LIBRARY_PATH=/opt/scali/lib64"
arguments = $(simulator_executable) $(deck_name)
output = PARALLEL.RUNLOG
error = PARALLEL_$(Node).CONDOR.ERROR
log = PARALLEL.CONDOR.LOG
transfer_input_files = $(simulator_executable), $(case_name).DATA
should_transfer_files = YES
when_to_transfer_output = ON_EXIT_OR_EVICT
Requirements = ( LoadAvg < 0.10 ) && ( eclipse_available > 0 )
Rank = KFlops
+AccountingGroup = "REB_018925"
Queue
 
I have a "set -x" in the mpiscript wrapper, and the output follows:
 
[jwingard@dmvx reb]$ cat PARALLEL.CONDOR.ERROR
+ _CONDOR_PROCNO=0
+ _CONDOR_NPROCS=4
++ condor_config_val libexec
+ CONDOR_SSH=/usr/local/condor/libexec
+ CONDOR_SSH=/usr/local/condor/libexec/condor_ssh
++ condor_config_val libexec
+ SSHD_SH=/usr/local/condor/libexec
+ SSHD_SH=/usr/local/condor/libexec/sshd.sh
+ . /usr/local/condor/libexec/sshd.sh 0 4
++ trap sshd_cleanup 15
+++ condor_config_val CONDOR_SSHD
++ SSHD=/usr/sbin/sshd
+++ condor_config_val CONDOR_SSH_KEYGEN
++ KEYGEN=/usr/bin/ssh-keygen
+++ condor_config_val libexec
++ CONDOR_CHIRP=/usr/local/condor/libexec
++ CONDOR_CHIRP=/usr/local/condor/libexec/condor_chirp
++ PORT=4444
++ _CONDOR_REMOTE_SPOOL_DIR=/data/condor/spool/cluster376.proc0.subproc0
++ _CONDOR_PROCNO=0
++ _CONDOR_NPROCS=4
++ mkdir /data/condor/execute/dir_10743/tmp
++ hostkey=/data/condor/execute/dir_10743/tmp/hostkey
++ /bin/rm -f /data/condor/execute/dir_10743/tmp/hostkey /data/condor/execute/dir_10743/tmp/hostkey.pub
++ /usr/bin/ssh-keygen -q -f /data/condor/execute/dir_10743/tmp/hostkey -t rsa -N ''
++ '[' 0 -ne 0 ']'
++ idkey=/data/condor/execute/dir_10743/tmp/0.key
++ /usr/bin/ssh-keygen -q -f /data/condor/execute/dir_10743/tmp/0.key -t rsa -N ''
++ '[' 0 -ne 0 ']'
++ /usr/local/condor/libexec/condor_chirp put -perm 0700 /data/condor/execute/dir_10743/tmp/0.key /data/condor/spool/cluster376.proc0.subproc0/0.key
++ '[' 0 -ne 0 ']'
++ done=0
++ '[' 0 -eq 0 ']'
++ /usr/sbin/sshd -p4444 -oAuthorizedKeysFile=/data/condor/execute/dir_10743/tmp/0.key.pub -h/data/condor/execute/dir_10743/tmp/hostkey -De -f/dev/null -oStrictModes=no -oPidFile=/dev/null -oAcceptEnv=_CONDOR
++ pid=10766
++ sleep 2
++ grep 'Server listening' sshd.out
++ done=1
++ '[' 1 -eq 0 ']'
++ /bin/rm sshd.out
+++ hostname
++ hostname=c02.vxnet
+++ pwd
++ currentDir=/c02/condor/execute/dir_10743
+++ whoami
++ user=jwingard
++ echo '0 c02.vxnet 4444 jwingard /c02/condor/execute/dir_10743'
++ /usr/local/condor/libexec/condor_chirp put -mode cwa - /data/condor/spool/cluster376.proc0.subproc0/contact
++ '[' 0 -ne 0 ']'
++ '[' 0 -eq 0 ']'
++ done=0
++ '[' 0 -eq 0 ']'
++ /bin/rm -f contact
++ /usr/local/condor/libexec/condor_chirp fetch /data/condor/spool/cluster376.proc0.subproc0/contact /data/condor/execute/dir_10743/contact
+++ wc -l /data/condor/execute/dir_10743/contact
+++ awk '{print $1}'
++ lines=1
++ '[' 1 -eq 4 ']'
++ sleep 1
++ '[' 0 -eq 0 ']'
++ /bin/rm -f contact
++ /usr/local/condor/libexec/condor_chirp fetch /data/condor/spool/cluster376.proc0.subproc0/contact /data/condor/execute/dir_10743/contact
+++ wc -l /data/condor/execute/dir_10743/contact
+++ awk '{print $1}'
++ lines=3
++ '[' 3 -eq 4 ']'
++ sleep 1
++ '[' 0 -eq 0 ']'
++ /bin/rm -f contact
++ /usr/local/condor/libexec/condor_chirp fetch /data/condor/spool/cluster376.proc0.subproc0/contact /data/condor/execute/dir_10743/contact
+++ wc -l /data/condor/execute/dir_10743/contact
+++ awk '{print $1}'
++ lines=3
++ '[' 3 -eq 4 ']'
++ sleep 1
++ '[' 0 -eq 0 ']'
++ /bin/rm -f contact
++ /usr/local/condor/libexec/condor_chirp fetch /data/condor/spool/cluster376.proc0.subproc0/contact /data/condor/execute/dir_10743/contact
+++ wc -l /data/condor/execute/dir_10743/contact
+++ awk '{print $1}'
++ lines=4
++ '[' 4 -eq 4 ']'
++ done=1
++ node=0
++ '[' 0 -ne 4 ']'
++ /usr/local/condor/libexec/condor_chirp fetch /data/condor/spool/cluster376.proc0.subproc0/0.key /data/condor/execute/dir_10743/tmp/0.key
++ /usr/local/condor/libexec/condor_chirp remove /data/condor/spool/cluster376.proc0.subproc0/0.key
+++ expr 0 + 1
++ node=1
++ '[' 1 -ne 4 ']'
++ /usr/local/condor/libexec/condor_chirp fetch /data/condor/spool/cluster376.proc0.subproc0/1.key /data/condor/execute/dir_10743/tmp/1.key
++ /usr/local/condor/libexec/condor_chirp remove /data/condor/spool/cluster376.proc0.subproc0/1.key
+++ expr 1 + 1
++ node=2
++ '[' 2 -ne 4 ']'
++ /usr/local/condor/libexec/condor_chirp fetch /data/condor/spool/cluster376.proc0.subproc0/2.key /data/condor/execute/dir_10743/tmp/2.key
++ /usr/local/condor/libexec/condor_chirp remove /data/condor/spool/cluster376.proc0.subproc0/2.key
+++ expr 2 + 1
++ node=3
++ '[' 3 -ne 4 ']'
++ /usr/local/condor/libexec/condor_chirp fetch /data/condor/spool/cluster376.proc0.subproc0/3.key /data/condor/execute/dir_10743/tmp/3.key
++ /usr/local/condor/libexec/condor_chirp remove /data/condor/spool/cluster376.proc0.subproc0/3.key
+++ expr 3 + 1
++ node=4
++ '[' 4 -ne 4 ']'
++ chmod 0700 /data/condor/execute/dir_10743/tmp/0.key /data/condor/execute/dir_10743/tmp/1.key /data/condor/execute/dir_10743/tmp/2.key /data/condor/execute/dir_10743/tmp/3.key
++ /usr/local/condor/libexec/condor_chirp remove /data/condor/spool/cluster376.proc0.subproc0/contact
++ '[' 1 -eq 0 ']'
+ '[' 0 -ne 0 ']'
+ EXECUTABLE=/usr/local/share/ecl/2008.1/bin/linux_x86_64/eclipse_scampi.exe
+ shift
+ chmod +x /usr/local/share/ecl/2008.1/bin/linux_x86_64/eclipse_scampi.exe
chmod: changing permissions of `/usr/local/share/ecl/2008.1/bin/linux_x86_64/eclipse_scampi.exe': Operation not permitted
+ MPDIR=/opt/scali/bin
+ PATH=/opt/scali/bin:.:/usr/local/condor/bin:/sbin:/usr/sbin:/bin:/usr/bin:/usr/X11R6/bin
+ export PATH
+ export P4_RSHCOMMAND=/usr/local/condor/libexec/condor_ssh
+ P4_RSHCOMMAND=/usr/local/condor/libexec/condor_ssh
+ CONDOR_CONTACT_FILE=/data/condor/execute/dir_10743/contact
+ export CONDOR_CONTACT_FILE
+ sort -n +0
+ awk '{print $2}'
+ mpirun -v -np 4 -machinefile condor_machines /usr/local/share/ecl/2008.1/bin/linux_x86_64/eclipse_scampi.exe PARALLEL4
*** glibc detected *** malloc(): memory corruption: 0x0000000000505700 ***
/opt/scali/bin/mpirun: line 646: 10885 Aborted                 $MPIMON $MPIMON_OPTS $_PROGRAM $_PROGOPTS -- $RUN_LIST
+ sshd_cleanup
+ /bin/rm -f /data/condor/execute/dir_10743/tmp/hostkey /data/condor/execute/dir_10743/tmp/hostkey.pub /data/condor/execute/dir_10743/tmp/0.key /data/condor/execute/dir_10743/tmp/0.key.pub sshd.out /data/condor/execute/dir_10743/contact
+ exit 0