[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [HTCondor-users] Condor and Ceph




On Sep 25, 2015, at 4:38 PM, Ben Cotton <ben.cotton@xxxxxxxxxxxxxxxxxx> wrote:

Steve,

It seems like that error is normally associated with files larger than
C can handle (e.g.
http://www.gnu.org/software/coreutils/faq/coreutils-faq.html#Value-too-large-for-defined-data-type).
In this case, I doubt that's what's actually happening, but I wonder
if the Ceph driver is giving a response that confuses the stat() call.
If you run:

 strace condor_submit submit.job

That might tell us where the failure is happening (if one of the
HTCondor developers doesn't already have an idea).


Thanks,
BC


In this case strace doesn't tell *me* anything more :)

…

time(NULL)                              = 1443224991
recv(4, "\1\0\0\0\10", 5, 0)            = 5
recv(4, "\0\0\0\0\0\0\0\1", 8, 0)       = 8
recv(4, "\1\0\0\0008", 5, 0)            = 5
recv(4, "\0\0\0\0\0\0\0\30\0\0\0\0\0\0\0\2\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\30"..., 56, 0) = 56
recv(4, "\1\0\0\0\375", 5, 0)           = 5
recv(4, "\0\0\0\0\0\0\0\5TriedAuthentication = tr"..., 253, 0) = 253
time(NULL)                              = 1443224991
time(NULL)                              = 1443224991
geteuid32()                             = 257
send(4, "\1\0\0\0\10\0\0\0\0\0\0'\22", 13, 0) = 13
recv(4, "\1\0\0\0\10", 5, 0)            = 5
recv(4, "\0\0\0\0\0\0\f\3", 8, 0)       = 8
send(4, "\1\0\0\0\20\0\0\0\0\0\0'\23\0\0\0\0\0\0\f\3", 21, 0) = 21
recv(4, "\1\0\0\0\10", 5, 0)            = 5
recv(4, "\0\0\0\0\0\0\0\0", 8, 0)       = 8
stat64("/home/lusol/condor", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
geteuid32()                             = 257
getcwd("/home/lusol/condor", 256)       = 19
stat64("where-am-i", {st_mode=S_IFREG|0755, st_size=19, ...}) = 0
lstat64("where-am-i", {st_mode=S_IFREG|0755, st_size=19, ...}) = 0
time(NULL)                              = 1443224991
stat64("/etc/localtime", {st_mode=S_IFREG|0644, st_size=3519, ...}) = 0
open("/home/lusol/condor/job.log", O_RDWR|O_APPEND) = 5
fcntl64(5, F_GETFL)                     = 0x402 (flags O_RDWR|O_APPEND)
fstat64(5, {st_mode=S_IFREG|0644, st_size=1679924, ...}) = 0
mmap2(NULL, 1048576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0xfffffffff7598000
_llseek(5, 0, [0], SEEK_CUR)            = 0
close(5)                                = 0
munmap(0xf7598000, 1048576)             = 0
getrlimit(RLIMIT_CORE, {rlim_cur=-4294967296, rlim_max=32649899137263210}) = 0
open("/home/lusol/condor/out.0", O_WRONLY|O_LARGEFILE) = 5
fstat64(5, {st_mode=S_IFREG|0644, st_size=0, ...}) = 0
close(5)                                = 0
write(2, "\nERROR: Can't open \"/home/lusol/"..., 104
ERROR: Can't open "/home/lusol/condor/out.0"  with flags 01101 (Value too large for defined data type)
) = 104
exit_group(1)                           = ?


[lusol@condor condor]$ dir
total 1728
drwxr-xr-x 5 lusol bin    4096 Sep 25 19:45 .
drwx------ 7 lusol bin    8192 Sep 25 19:44 ..
drwxr-xr-x 5 lusol bin      83 Sep 25 15:02 dedicated
-rw-r--r-- 1 lusol bin   20871 Sep 25 15:02 hist.out
-rw-r--r-- 1 lusol bin 1679924 Sep 25 19:48 job.log
drwxr-xr-x 2 lusol bin    4096 Sep 25 15:02 neos-run
-rw-r--r-- 1 lusol bin       0 Sep 25 15:48 out.0
-rw-r--r-- 1 lusol bin     578 Sep 25 19:45 submit.job
-rw-r--r-- 1 lusol bin     593 Sep 25 16:00 submit.job~
drwxr-xr-x 2 lusol bin    4096 Sep 25 15:02 WAMATA
-rwxr-xr-x 1 lusol bin      19 Sep 25 15:02 where-am-i
-rw-r--r-- 1 lusol bin     249 Sep 25 15:02 win32.submit.job


[lusol@condor condor]$ df -h .
Filesystem         Size  Used Avail Use% Mounted on
nas2:/zhome/lusol   15T   12T  3.2T  80% /home/lusol
[lusol@condor condor]$