[Gems-users] Problems on running multiple non-parallel benchmarks on CMP


Date: Sat, 22 Nov 2008 18:22:35 +0100
From: pauerola@xxxxxxxxxx
Subject: [Gems-users] Problems on running multiple non-parallel benchmarks on CMP
[ Follows thread
https://lists.cs.wisc.edu/archive/gems-users/2008-October/msg00088.shtml ]


Greetings,

We don't yet have a solution to the problem described previously, but we have
more information, obtained with different profiles. Any guidance, ideas, ...
anything at all would be gratefully received.

We have launched a simple matrix loop on cores 0, 2, 4 and 7 (with pbind) on an
8-core CMP. The main assembly code is

        ! Inner-loop body. The tracer output shows the closing `bl` jumping back
        ! to the first `ld` of this listing (0x10c44), so .L189 labels its top.
        ! Presumably this is `s += A[i*N+k] * B[k*N+j]` from matrix.c, with
        ! %i2 = i, %i3 = j, %i4 = k -- TODO confirm against the compiler output.
        ld      [%fp-40],%f6            ! %f6 = running sum kept at [%fp-40]
        ld      [%fp-8],%l3             ! %l3 = base pointer of first array (presumably A)
        mov     %i2,%l4
        sll     %l4,10,%l2              ! %l2 = %l4 * 1024
        sll     %l4,3,%l1               ! %l1 = %l4 * 8
        sub     %l2,%l1,%l2             ! %l2 = %l4 * 1016
        sll     %l4,4,%l1               ! %l1 = %l4 * 16
        sub     %l2,%l1,%l6             ! %l6 = %l4 * 1000
        add     %l6,%i4,%l0             ! %l0 = %i2*1000 + %i4 (element index)
        sll     %l0,2,%l1               ! %l1 = element index * 4 (byte offset)
        ld      [%l3+%l1],%f5           ! %f5 = first multiplicand
        ld      [%fp-12],%l3            ! %l3 = base pointer of second array (presumably B)
        sll     %i4,10,%l2              ! same *1000 sequence, now applied to %i4
        sll     %i4,3,%l1
        sub     %l2,%l1,%l2
        sll     %i4,4,%l1
        sub     %l2,%l1,%l0             ! %l0 = %i4 * 1000
        mov     %i3,%l2
        add     %l0,%l2,%l0             ! %l0 = %i4*1000 + %i3 (element index)
        sll     %l0,2,%l1               ! %l1 = element index * 4 (byte offset)
        ld      [%l3+%l1],%f4           ! %f4 = second multiplicand
        fmuls   %f5,%f4,%f4             ! %f4 = %f5 * %f4
        fadds   %f6,%f4,%f4             ! %f4 = running sum + product
        st      %f4,[%fp-40]            ! write the running sum back to its stack slot
        add     %i4,10,%i4              ! advance the loop counter by 10
        cmp     %i4,1000
        bl      .L189                   ! repeat while %i4 < 1000 (signed compare)
        nop                             ! branch delay slot

which can be seen executing on core 2 and on core 4 with the Simics tracer:

**************
*** CORE 2 ***
**************
inst: [      645] CPU  2 <v:0x0000000000010c6c> <p:0x0080f590c6c> cb04c011 ld
[%l3 + %l1], %f5
data: [      137] CPU  2 <v:0x00000000003070c8> <p:0x00078bb30c8> FP Read   4
bytes  0x0
inst: [      649] CPU  2 <v:0x0000000000010c70> <p:0x0080f590c70> e607bff4 lduw
[%fp + -12], %l3
data: [      140] CPU  2 <v:0x00000000ffbffd84> <p:0x008004a7d84> Nrml Read   4
bytes  0x3f17c0
inst: [      653] CPU  2 <v:0x0000000000010c74> <p:0x0080f590c74> a52f200a sll
%i4, 10, %l2
inst: [      657] CPU  2 <v:0x0000000000010c78> <p:0x0080f590c78> a32f2003 sll
%i4, 3, %l1
inst: [      661] CPU  2 <v:0x0000000000010c7c> <p:0x0080f590c7c> a4248011 sub
%l2, %l1, %l2
inst: [      665] CPU  2 <v:0x0000000000010c80> <p:0x0080f590c80> a32f2004 sll
%i4, 4, %l1
inst: [      669] CPU  2 <v:0x0000000000010c84> <p:0x0080f590c84> a0248011 sub
%l2, %l1, %l0
inst: [      673] CPU  2 <v:0x0000000000010c88> <p:0x0080f590c88> a416c000 or
%i3, %g0, %l2
inst: [      677] CPU  2 <v:0x0000000000010c8c> <p:0x0080f590c8c> a0040012 add
%l0, %l2, %l0
inst: [      681] CPU  2 <v:0x0000000000010c90> <p:0x0080f590c90> a32c2002 sll
%l0, 2, %l1
inst: [      685] CPU  2 <v:0x0000000000010c94> <p:0x0080f590c94> c904c011 ld
[%l3 + %l1], %f4
data: [      149] CPU  2 <v:0x0000000000787a88> <p:0x0086d833a88> FP Read   4
bytes  0x0
inst: [      689] CPU  2 <v:0x0000000000010c98> <p:0x0080f590c98> 89a14924 fmuls
%f5, %f4, %f4
inst: [      693] CPU  2 <v:0x0000000000010c9c> <p:0x0080f590c9c> 89a18824 fadds
%f6, %f4, %f4
inst: [      697] CPU  2 <v:0x0000000000010ca0> <p:0x0080f590ca0> c927bfd8 st
%f4, [%fp + -40]
data: [      150] CPU  2 <v:0x00000000ffbffd68> <p:0x008004a7d68> FP Write  4
bytes  0x0
inst: [      701] CPU  2 <v:0x0000000000010ca4> <p:0x0080f590ca4> b807200a add
%i4, 10, %i4
inst: [      705] CPU  2 <v:0x0000000000010ca8> <p:0x0080f590ca8> 80a723e8 cmp
%i4, 1000
inst: [      709] CPU  2 <v:0x0000000000010cac> <p:0x0080f590cac> 06bfffe6 bl
0x10c44
inst: [      713] CPU  2 <v:0x0000000000010cb0> <p:0x0080f590cb0> 01000000 nop
inst: [      717] CPU  2 <v:0x0000000000010c44> <p:0x0080f590c44> cd07bfd8 ld
[%fp + -40], %f6
data: [      152] CPU  2 <v:0x00000000ffbffd68> <p:0x008004a7d68> FP Read   4
bytes  0x0
inst: [      721] CPU  2 <v:0x0000000000010c48> <p:0x0080f590c48> e607bff8 lduw
[%fp + -8], %l3
data: [      155] CPU  2 <v:0x00000000ffbffd88> <p:0x008004a7d88> Nrml Read   4
bytes  0x20eb8
inst: [      725] CPU  2 <v:0x0000000000010c4c> <p:0x0080f590c4c> a8168000 or
%i2, %g0, %l4
inst: [      730] CPU  2 <v:0x0000000000010c50> <p:0x0080f590c50> a52d200a sll
%l4, 10, %l2
inst: [      735] CPU  2 <v:0x0000000000010c54> <p:0x0080f590c54> a32d2003 sll
%l4, 3, %l1
inst: [      739] CPU  2 <v:0x0000000000010c58> <p:0x0080f590c58> a4248011 sub
%l2, %l1, %l2
inst: [      743] CPU  2 <v:0x0000000000010c5c> <p:0x0080f590c5c> a32d2004 sll
%l4, 4, %l1
inst: [      747] CPU  2 <v:0x0000000000010c60> <p:0x0080f590c60> ac248011 sub
%l2, %l1, %l6
inst: [      751] CPU  2 <v:0x0000000000010c64> <p:0x0080f590c64> a005801c add
%l6, %i4, %l0
inst: [      755] CPU  2 <v:0x0000000000010c68> <p:0x0080f590c68> a32c2002 sll
%l0, 2, %l1
inst: [      759] CPU  2 <v:0x0000000000010c6c> <p:0x0080f590c6c> cb04c011 ld
[%l3 + %l1], %f5
data: [      160] CPU  2 <v:0x00000000003070f0> <p:0x00078bb30f0> FP Read   4
bytes  0x0
inst: [      763] CPU  2 <v:0x0000000000010c70> <p:0x0080f590c70> e607bff4 lduw
[%fp + -12], %l3
data: [      162] CPU  2 <v:0x00000000ffbffd84> <p:0x008004a7d84> Nrml Read   4
bytes  0x3f17c0

**************
*** CORE 4 ***
**************
inst: [        2] CPU  4 <v:0x0000000000010cb0> <p:0x0007ececcb0> 01000000 nop
inst: [        4] CPU  4 <v:0x0000000000010c44> <p:0x0007ececc44> cd07bfd8 ld
[%fp + -40], %f6
data: [        1] CPU  4 <v:0x00000000ffbffd68> <p:0x0087e583d68> FP Read   4
bytes  0x0
inst: [        6] CPU  4 <v:0x0000000000010c48> <p:0x0007ececc48> e607bff8 lduw
[%fp + -8], %l3
data: [        2] CPU  4 <v:0x00000000ffbffd88> <p:0x0087e583d88> Nrml Read   4
bytes  0x20eb8
inst: [        8] CPU  4 <v:0x0000000000010c4c> <p:0x0007ececc4c> a8168000 or
%i2, %g0, %l4
inst: [       10] CPU  4 <v:0x0000000000010c50> <p:0x0007ececc50> a52d200a sll
%l4, 10, %l2
inst: [       12] CPU  4 <v:0x0000000000010c54> <p:0x0007ececc54> a32d2003 sll
%l4, 3, %l1
inst: [       14] CPU  4 <v:0x0000000000010c58> <p:0x0007ececc58> a4248011 sub
%l2, %l1, %l2
inst: [       16] CPU  4 <v:0x0000000000010c5c> <p:0x0007ececc5c> a32d2004 sll
%l4, 4, %l1
inst: [       18] CPU  4 <v:0x0000000000010c60> <p:0x0007ececc60> ac248011 sub
%l2, %l1, %l6
inst: [       20] CPU  4 <v:0x0000000000010c64> <p:0x0007ececc64> a005801c add
%l6, %i4, %l0
inst: [       22] CPU  4 <v:0x0000000000010c68> <p:0x0007ececc68> a32c2002 sll
%l0, 2, %l1
inst: [       24] CPU  4 <v:0x0000000000010c6c> <p:0x0007ececc6c> cb04c011 ld
[%l3 + %l1], %f5
data: [        4] CPU  4 <v:0x000000000029ccf0> <p:0x0087f424cf0> FP Read   4
bytes  0x0
inst: [       26] CPU  4 <v:0x0000000000010c70> <p:0x0007ececc70> e607bff4 lduw
[%fp + -12], %l3
data: [        6] CPU  4 <v:0x00000000ffbffd84> <p:0x0087e583d84> Nrml Read   4
bytes  0x3f17c0
inst: [       28] CPU  4 <v:0x0000000000010c74> <p:0x0007ececc74> a52f200a sll
%i4, 10, %l2
inst: [       30] CPU  4 <v:0x0000000000010c78> <p:0x0007ececc78> a32f2003 sll
%i4, 3, %l1
inst: [       32] CPU  4 <v:0x0000000000010c7c> <p:0x0007ececc7c> a4248011 sub
%l2, %l1, %l2
inst: [       34] CPU  4 <v:0x0000000000010c80> <p:0x0007ececc80> a32f2004 sll
%i4, 4, %l1
inst: [       36] CPU  4 <v:0x0000000000010c84> <p:0x0007ececc84> a0248011 sub
%l2, %l1, %l0
inst: [       38] CPU  4 <v:0x0000000000010c88> <p:0x0007ececc88> a416c000 or
%i3, %g0, %l2
inst: [       40] CPU  4 <v:0x0000000000010c8c> <p:0x0007ececc8c> a0040012 add
%l0, %l2, %l0
inst: [       42] CPU  4 <v:0x0000000000010c90> <p:0x0007ececc90> a32c2002 sll
%l0, 2, %l1
inst: [       44] CPU  4 <v:0x0000000000010c94> <p:0x0007ececc94> c904c011 ld
[%l3 + %l1], %f4
data: [        9] CPU  4 <v:0x0000000000484ae8> <p:0x0087f60cae8> FP Read   4
bytes  0x0
inst: [       46] CPU  4 <v:0x0000000000010c98> <p:0x0007ececc98> 89a14924 fmuls
%f5, %f4, %f4
inst: [       48] CPU  4 <v:0x0000000000010c9c> <p:0x0007ececc9c> 89a18824 fadds
%f6, %f4, %f4
inst: [       50] CPU  4 <v:0x0000000000010ca0> <p:0x0007ececca0> c927bfd8 st
%f4, [%fp + -40]
data: [       10] CPU  4 <v:0x00000000ffbffd68> <p:0x0087e583d68> FP Write  4
bytes  0x0
inst: [       52] CPU  4 <v:0x0000000000010ca4> <p:0x0007ececca4> b807200a add
%i4, 10, %i4
inst: [       54] CPU  4 <v:0x0000000000010ca8> <p:0x0007ececca8> 80a723e8 cmp
%i4, 1000
inst: [       56] CPU  4 <v:0x0000000000010cac> <p:0x0007ececcac> 06bfffe6 bl
0x10c44
inst: [       58] CPU  4 <v:0x0000000000010cb0> <p:0x0007ececcb0> 01000000 nop
inst: [       60] CPU  4 <v:0x0000000000010c44> <p:0x0007ececc44> cd07bfd8 ld
[%fp + -40], %f6
data: [       13] CPU  4 <v:0x00000000ffbffd68> <p:0x0087e583d68> FP Read   4
bytes  0x0


These executions seem correct, but when we look at the Ruby debugger we see
that only core 2 does its work. Core 4 executes ifetches, but no data has
been loaded (addresses 0x87xxxxxxx).

**************
*** CORE 2 ***
**************
    323   0   2    L1Cache      Exclusive_Data     IS>M_W    [0x78bb3040, line
0x78bb3040]
    373   0   2    L1Cache         Use_Timeout    M_W>M      [0x78bb3040, line
0x78bb3040]
    383   0   2    L1Cache                Load      I>IS     [0x78bb3080, line
0x78bb3080]
    698   0   2    L1Cache      Exclusive_Data     IS>M_W    [0x78bb3080, line
0x78bb3080]
    730   0   2    L1Cache                Load      I>IS     [0x78bb30c0, line
0x78bb30c0]
    748   0   2    L1Cache         Use_Timeout    M_W>M      [0x78bb3080, line
0x78bb3080]
   1048   0   2    L1Cache      Exclusive_Data     IS>M_W    [0x78bb30c0, line
0x78bb30c0]
   1098   0   2    L1Cache         Use_Timeout    M_W>M      [0x78bb30c0, line
0x78bb30c0]
   1108   0   2    L1Cache                Load      I>IS     [0x78bb3100, line
0x78bb3100]
   1425   0   2    L1Cache      Exclusive_Data     IS>M_W    [0x78bb3100, line
0x78bb3100]
   1457   0   2    L1Cache                Load      I>IS     [0x78bb3140, line
0x78bb3140]
   1475   0   2    L1Cache         Use_Timeout    M_W>M      [0x78bb3100, line
0x78bb3100]
   1771   0   2    L1Cache      Exclusive_Data     IS>M_W    [0x78bb3140, line
0x78bb3140]
   1821   0   2    L1Cache         Use_Timeout    M_W>M      [0x78bb3140, line
0x78bb3140]
   1831   0   2    L1Cache                Load      I>IS     [0x78bb3180, line
0x78bb3180]
   2145   0   2    L1Cache      Exclusive_Data     IS>M_W    [0x78bb3180, line
0x78bb3180]
   2170   0   2    L1Cache               Store      I>IM     [0x79357740, line
0x79357740]
   2195   0   2    L1Cache         Use_Timeout    M_W>M      [0x78bb3180, line
0x78bb3180]
   2487   0   2    L1Cache      Exclusive_Data     IM>OM     [0x79357740, line
0x79357740]
   2488   0   2    L1Cache            All_acks     OM>MM_W   [0x79357740, line
0x79357740]
   2511   0   2    L1Cache                Load      I>IS     [0x78bb2200, line
0x78bb2200]
   2538   0   2    L1Cache         Use_Timeout   MM_W>MM     [0x79357740, line
0x79357740]
   2825   0   2    L1Cache      Exclusive_Data     IS>M_W    [0x78bb2200, line
0x78bb2200]
   2857   0   2    L1Cache                Load      I>IS     [0x78bb2240, line
0x78bb2240]
   2875   0   2    L1Cache         Use_Timeout    M_W>M      [0x78bb2200, line
0x78bb2200]
   ...

**************
*** CORE 4 ***
**************
    339   0   4    L1Cache      Exclusive_Data     IS>M_W    [0x7ececc80, line
0x7ececc80]
    345   0   4    L1Cache              Ifetch      I>IS     [0x7ececc40, line
0x7ececc40]
    389   0   4    L1Cache         Use_Timeout    M_W>M      [0x7ececc80, line
0x7ececc80]
    659   0   4    L1Cache      Exclusive_Data     IS>M_W    [0x7ececc40, line
0x7ececc40]
    709   0   4    L1Cache         Use_Timeout    M_W>M      [0x7ececc40, line
0x7ececc40]
    722   0   4    L1Cache              Ifetch      I>IS     [0x7ececcc0, line
0x7ececcc0]
   1038   0   4    L1Cache      Exclusive_Data     IS>M_W    [0x7ececcc0, line
0x7ececcc0]
   1047   0   4    L1Cache              Ifetch      I>IS     [0x7ececc00, line
0x7ececc00]
   1088   0   4    L1Cache         Use_Timeout    M_W>M      [0x7ececcc0, line
0x7ececcc0]
   1364   0   4    L1Cache      Exclusive_Data     IS>M_W    [0x7ececc00, line
0x7ececc00]
   1369   0   4    L1Cache                Load      I>IS     [0x7ececd00, line
0x7ececd00]
   1414   0   4    L1Cache         Use_Timeout    M_W>M      [0x7ececc00, line
0x7ececc00]
   1683   0   4    L1Cache      Exclusive_Data     IS>M_W    [0x7ececd00, line
0x7ececd00]
   1733   0   4    L1Cache         Use_Timeout    M_W>M      [0x7ececd00, line
0x7ececd00]


Profiling Sequencer.C, we see that core 2 behaves correctly and that core 4
repeatedly executes ifetches.

**************
*** CORE 2 ***
**************
Version 2, Address 78BB3118, Hit/Miss h
Version 2, Address 78BB3140, Hit/Miss h
Version 2, Address 78BB3168, Hit/Miss h
Version 2, Address 78BB3190, Hit/Miss h
Version 2, Address 79357A90, Hit/Miss M
Version 2, Address 78BB2218, Hit/Miss h
Version 2, Address 78BB2240, Hit/Miss h
Version 2, Address 78BB2268, Hit/Miss h
Version 2, Address 78BB2290, Hit/Miss M
Version 2, Address 78BB22B8, Hit/Miss h
Version 2, Address 78BB22E0, Hit/Miss h
Version 2, Address 78BB2308, Hit/Miss h
Version 2, Address 78BB2330, Hit/Miss h
Version 2, Address 78BB2358, Hit/Miss h
Version 2, Address 78BB2380, Hit/Miss h
Version 2, Address 78BB23A8, Hit/Miss h
Version 2, Address 78BB23D0, Hit/Miss h
Version 2, Address 78BB23F8, Hit/Miss h
Version 2, Address 78BB2420, Hit/Miss h
Version 2, Address 78BB2448, Hit/Miss h

**************
*** CORE 4 ***
**************
Version 4, Address 7ECECC44, Hit/Miss h
Version 4, Address 7ECECC48, Hit/Miss h
Version 4, Address 7ECECC4C, Hit/Miss h
Version 4, Address 7ECECC50, Hit/Miss h
Version 4, Address 7ECECC54, Hit/Miss h
Version 4, Address 7ECECC58, Hit/Miss h
Version 4, Address 7ECECC5C, Hit/Miss h
Version 4, Address 7ECECC60, Hit/Miss h
Version 4, Address 7ECECC64, Hit/Miss h
Version 4, Address 7ECECC68, Hit/Miss h
Version 4, Address 7ECECC6C, Hit/Miss h
Version 4, Address 7ECECC70, Hit/Miss h
Version 4, Address 7ECECC74, Hit/Miss h
Version 4, Address 7ECECC78, Hit/Miss h
Version 4, Address 7ECECC7C, Hit/Miss h
Version 4, Address 7ECECC80, Hit/Miss h
Version 4, Address 7ECECC84, Hit/Miss h
Version 4, Address 7ECECC88, Hit/Miss h
Version 4, Address 7ECECC8C, Hit/Miss h
Version 4, Address 7ECECC90, Hit/Miss h
Version 4, Address 7ECECC94, Hit/Miss h
Version 4, Address 7ECECC98, Hit/Miss h
Version 4, Address 7ECECC9C, Hit/Miss h
Version 4, Address 7ECECCA0, Hit/Miss h
Version 4, Address 7ECECCA4, Hit/Miss h
Version 4, Address 7ECECCA8, Hit/Miss h
Version 4, Address 7ECECCAC, Hit/Miss h
Version 4, Address 7ECECCB0, Hit/Miss h
Version 4, Address 7ECECC44, Hit/Miss h
Version 4, Address 7ECECC48, Hit/Miss h
Version 4, Address 7ECECC4C, Hit/Miss h
Version 4, Address 7ECECC50, Hit/Miss h


It seems that core 4 loads instructions but never executes the code... we are
really confused; any idea would be valuable.
If you would like to repeat our experiment in your environment, you can use the
Simics script below to create our matrix.c code, compile it (modify your compiler
path if you need to) and create a checkpoint from which to run Ruby.

Many thanks,
Pau

##############
### matrix ###
##############
# Build matrix.c inside the simulated machine by typing one `echo ... >> matrix.c`
# command per source line into console con0.
# Each `c 10000000` continues the simulation (presumably for that many
# steps/cycles -- confirm) so the guest shell can consume the input.
# NOTE: `>>` appends, so a matrix.c already present in the guest's working
# directory would be extended rather than replaced.
con0.input "\n"
c 10000000

# Headers.
con0.input "echo \"#include <stdlib.h>\" >> matrix.c\n"
c 10000000
con0.input "echo \"#include <stdio.h>\" >> matrix.c\n"
c 10000000

# Problem size: N x N matrices, with loops that advance by `step`.
con0.input "echo \"#define N 1000 \" >> matrix.c\n"
c 10000000
con0.input "echo \"#define step 10 \" >> matrix.c\n"
c 10000000

# main() and its locals.
con0.input "echo \"int main(int argc, char** argv)\" >> matrix.c\n"
c 10000000
con0.input "echo \"{\" >> matrix.c\n"
c 10000000
con0.input "echo \"   float *A, *B, *C;\" >> matrix.c\n"
c 10000000
con0.input "echo \"   register int i, j, k, w;\" >> matrix.c\n"
c 10000000
con0.input "echo \"   register float s;\" >> matrix.c\n"
c 10000000

# Allocate the three matrices; the generated program never initialises or
# frees them.
con0.input "echo \"   A = (float*) malloc(sizeof(float)*N*N);\" >> matrix.c\n"
c 10000000
con0.input "echo \"   B = (float*) malloc(sizeof(float)*N*N);\" >> matrix.c\n"
c 10000000
con0.input "echo \"   C = (float*) malloc(sizeof(float)*N*N);\" >> matrix.c\n"
c 10000000

# Loop nest: w repeats the whole computation; i, j, k form the strided product.
# NOTE(review): there is no `c` between the next two inputs, unlike every other
# line in this section -- confirm the console buffers both commands.
con0.input "echo \"    for (w=0; w<100000; w++) {\" >> matrix.c\n"
con0.input "echo \"     for (i=0; i<N; i++) {\" >> matrix.c\n"
c 10000000
con0.input "echo \"       for (j=0; j<N; j+=step) {\" >> matrix.c\n"
c 10000000
con0.input "echo \"         s = (float)0;\" >> matrix.c\n"
c 10000000
con0.input "echo \"         for (k = 0; k < N; k+=step) {\" >> matrix.c\n"
c 10000000
con0.input "echo \"            s += ( A[i*N+k] * B[k*N+j] );\" >> matrix.c\n"
c 10000000
con0.input "echo \"         }\" >> matrix.c\n"
c 10000000
con0.input "echo \"         C[i*N+j] = s;\" >> matrix.c\n"
c 10000000
con0.input "echo \"       }\" >> matrix.c\n"
c 10000000
con0.input "echo \"     }\" >> matrix.c\n"
c 10000000
con0.input "echo \"   }\" >> matrix.c\n"
c 10000000

# Close main().
con0.input "echo \"   return 0;\" >> matrix.c\n"
c 10000000
con0.input "echo \"}\" >> matrix.c\n"
c 10000000

### modify your compiler path
### con0.input "/opt/SUNWspro/prod/bin/cc matrix.c -o matrix.rr\n"
# Compile with whichever `cc` is on the guest's PATH; the continue that follows
# is ten times longer than the others, to give the compiler time to finish.
con0.input "cc matrix.c -o matrix.rr\n"
c 100000000

# Make four copies under distinct names so each instance can later be looked up
# individually by process name.
con0.input "cp matrix.rr matrixA.rr\n"
c 10000000
con0.input "cp matrix.rr matrixB.rr\n"
c 10000000
con0.input "cp matrix.rr matrixC.rr\n"
c 10000000
con0.input "cp matrix.rr matrixD.rr\n"
c 10000000

# Start each copy in the background, look up its PID by process name with pgrep,
# and bind that PID to one processor with `pbind -b <cpu> <pid>`.
# The trailing A/B/C/D argument is not read by the generated main().
# `nice --50` presumably requests a negative nice increment (higher priority)
# -- confirm against the guest's nice(1).

# Instance A -> processor 2.
con0.input "/usr/bin/nice --50 ./matrixA.rr A &\n"
c 10000000
con0.input "PIDBIND=`pgrep matrixA.rr`\n"
c 10000000
con0.input "pbind -b 2 $PIDBIND\n"
c 10000000

# Instance B -> processor 4.
# NOTE(review): for B, C and D there is no `c` between the launch and the pgrep,
# unlike instance A -- confirm the shell still sees the process by then.
con0.input "/usr/bin/nice --50 ./matrixB.rr B &\n"
con0.input "PIDBIND=`pgrep matrixB.rr`\n"
c 10000000
con0.input "pbind -b 4 $PIDBIND\n"
c 10000000

# Instance C -> processor 0.
con0.input "/usr/bin/nice --50 ./matrixC.rr C &\n"
con0.input "PIDBIND=`pgrep matrixC.rr`\n"
c 10000000
con0.input "pbind -b 0 $PIDBIND\n"
c 10000000

# Instance D -> processor 6.
# NOTE(review): the message text mentions cores 0, 2, 4 and 7, but this binds
# to processor 6 -- confirm which one was actually used.
con0.input "/usr/bin/nice --50 ./matrixD.rr D &\n"
con0.input "PIDBIND=`pgrep matrixD.rr`\n"
c 10000000
con0.input "pbind -b 6 $PIDBIND\n"
c 10000000

# Resume the simulation (presumably with no step limit -- confirm).
run


[← Prev in Thread] Current Thread [Next in Thread→]