|
@@ -14,14 +14,16 @@
|
|
|
* See the GNU Lesser General Public License in COPYING.LGPL for more details.
|
|
|
*/
|
|
|
|
|
|
+#include "simple.h"
|
|
|
#include "dw_mult.h"
|
|
|
#ifdef STARPU_USE_GORDON
|
|
|
#include "gordon/func_sgemm_ibm.h"
|
|
|
#endif
|
|
|
+#include "xgemm_kernels.c"
|
|
|
|
|
|
-float *A[MAXSLICESY][MAXSLICESZ];
|
|
|
-float *B[MAXSLICESZ][MAXSLICESX];
|
|
|
-float *C[MAXSLICESY][MAXSLICESX];
|
|
|
+TYPE *A[MAXSLICESY][MAXSLICESZ];
|
|
|
+TYPE *B[MAXSLICESZ][MAXSLICESX];
|
|
|
+TYPE *C[MAXSLICESY][MAXSLICESX];
|
|
|
|
|
|
starpu_data_handle A_state[MAXSLICESY][MAXSLICESZ];
|
|
|
starpu_data_handle B_state[MAXSLICESZ][MAXSLICESX];
|
|
@@ -33,7 +35,10 @@ starpu_data_handle C_state[MAXSLICESY][MAXSLICESX];
|
|
|
static void submit_new_iter(unsigned x, unsigned y, unsigned iter);
|
|
|
|
|
|
/*
|
|
|
- * That program should compute C = A * B
|
|
|
+ * This program computes C = A * B
|
|
|
+ *
|
|
|
+ * The difference with xgemm.c is that matrices are here already split in
|
|
|
+ * blocks, and thus no data partitioning is needed.
|
|
|
*
|
|
|
* A of size (z,y)
|
|
|
* B of size (x,z)
|
|
@@ -59,6 +64,14 @@ static void init_problem_data(void)
|
|
|
{
|
|
|
unsigned i,j;
|
|
|
|
|
|
+ /* debug ... */
|
|
|
+ memset(A, 0, MAXSLICESY*MAXSLICESZ*sizeof(TYPE *));
|
|
|
+ memset(B, 0, MAXSLICESZ*MAXSLICESZ*sizeof(TYPE *));
|
|
|
+ memset(C, 0, MAXSLICESY*MAXSLICESX*sizeof(TYPE *));
|
|
|
+ memset(&A_state, 0, MAXSLICESY*MAXSLICESZ*sizeof(starpu_data_handle));
|
|
|
+ memset(&B_state, 0, MAXSLICESZ*MAXSLICESZ*sizeof(starpu_data_handle));
|
|
|
+ memset(&C_state, 0, MAXSLICESY*MAXSLICESX*sizeof(starpu_data_handle));
|
|
|
+
|
|
|
/* Allocate grids of buffer */
|
|
|
/* TODO pin ... */
|
|
|
unsigned z, y, x;
|
|
@@ -68,9 +81,9 @@ static void init_problem_data(void)
|
|
|
for (z = 0; z < nslicesz; z++)
|
|
|
{
|
|
|
#ifdef STARPU_HAVE_POSIX_MEMALIGN
|
|
|
- posix_memalign((void **)&A[y][z], MEM_ALIGNMENT, BLOCKSIZEZ*BLOCKSIZEY*sizeof(float));
|
|
|
+ posix_memalign((void **)&A[y][z], MEM_ALIGNMENT, BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
|
|
|
#else
|
|
|
- A[y][z] = malloc(BLOCKSIZEZ*BLOCKSIZEY*sizeof(float));
|
|
|
+ A[y][z] = malloc(BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
|
|
|
#endif
|
|
|
assert(A[y][z]);
|
|
|
}
|
|
@@ -81,9 +94,9 @@ static void init_problem_data(void)
|
|
|
for (x = 0; x < nslicesx; x++)
|
|
|
{
|
|
|
#ifdef STARPU_HAVE_POSIX_MEMALIGN
|
|
|
- posix_memalign((void **)&B[z][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEZ*sizeof(float));
|
|
|
+ posix_memalign((void **)&B[z][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
|
|
|
#else
|
|
|
- B[z][x] = malloc(BLOCKSIZEX*BLOCKSIZEZ*sizeof(float));
|
|
|
+ B[z][x] = malloc(BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
|
|
|
#endif
|
|
|
assert(B[z][x]);
|
|
|
}
|
|
@@ -94,9 +107,9 @@ static void init_problem_data(void)
|
|
|
for (x = 0; x < nslicesx; x++)
|
|
|
{
|
|
|
#ifdef STARPU_HAVE_POSIX_MEMALIGN
|
|
|
- posix_memalign((void **)&C[y][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEY*sizeof(float));
|
|
|
+ posix_memalign((void **)&C[y][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
|
|
|
#else
|
|
|
- C[y][x] = malloc(BLOCKSIZEX*BLOCKSIZEY*sizeof(float));
|
|
|
+ C[y][x] = malloc(BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
|
|
|
#endif
|
|
|
assert(C[y][x]);
|
|
|
}
|
|
@@ -111,7 +124,7 @@ static void init_problem_data(void)
|
|
|
for (j = 0; j < BLOCKSIZEY; j++)
|
|
|
for (i = 0; i < BLOCKSIZEZ; i++)
|
|
|
{
|
|
|
- A[blocky][blockz][i*BLOCKSIZEY + j] = (float)(1 + blockz + blocky*nslicesz);
|
|
|
+ A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(1 + blockz + blocky*nslicesz);
|
|
|
}
|
|
|
|
|
|
for (blockz = 0; blockz < nslicesz; blockz++)
|
|
@@ -119,7 +132,7 @@ static void init_problem_data(void)
|
|
|
for (j = 0; j < BLOCKSIZEZ; j++)
|
|
|
for (i = 0; i < BLOCKSIZEX; i++)
|
|
|
{
|
|
|
- B[blockz][blockx][i*BLOCKSIZEZ + j] = (float)(1 + blockx + blockz*nslicesx);
|
|
|
+ B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(1 + blockx + blockz*nslicesx);
|
|
|
}
|
|
|
}
|
|
|
else {
|
|
@@ -128,7 +141,7 @@ static void init_problem_data(void)
|
|
|
for (j = 0; j < BLOCKSIZEY; j++)
|
|
|
for (i = 0; i < BLOCKSIZEZ; i++)
|
|
|
{
|
|
|
- A[blocky][blockz][i*BLOCKSIZEY + j] = (float)(starpu_drand48());
|
|
|
+ A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(starpu_drand48());
|
|
|
}
|
|
|
|
|
|
for (blockz = 0; blockz < nslicesz; blockz++)
|
|
@@ -136,7 +149,7 @@ static void init_problem_data(void)
|
|
|
for (j = 0; j < BLOCKSIZEZ; j++)
|
|
|
for (i = 0; i < BLOCKSIZEX; i++)
|
|
|
{
|
|
|
- B[blockz][blockx][i*BLOCKSIZEZ + j] = (float)(starpu_drand48());
|
|
|
+ B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(starpu_drand48());
|
|
|
}
|
|
|
|
|
|
}
|
|
@@ -146,9 +159,11 @@ static void init_problem_data(void)
|
|
|
for (j = 0; j < BLOCKSIZEY; j++)
|
|
|
for (i = 0; i < BLOCKSIZEX; i++)
|
|
|
{
|
|
|
- C[blocky][blockx][i*BLOCKSIZEY + j] = (float)(blockx + blocky*nslicesx + 1);
|
|
|
+ C[blocky][blockx][i*BLOCKSIZEY + j] = (TYPE)(blockx + blocky*nslicesx + 1);
|
|
|
}
|
|
|
|
|
|
+ /* TODO: aren't we supposed to set data consistency to relaxed, since
|
|
|
+ * tags are supposed to provide the correct dependencies? */
|
|
|
|
|
|
/* declare the StarPU data to monitor */
|
|
|
for (y = 0; y < nslicesy; y++)
|
|
@@ -156,7 +171,7 @@ static void init_problem_data(void)
|
|
|
for (z = 0; z < nslicesz; z++)
|
|
|
{
|
|
|
starpu_matrix_data_register(&A_state[y][z], 0, (uintptr_t)A[y][z],
|
|
|
- BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEZ, sizeof(float));
|
|
|
+ BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEZ, sizeof(TYPE));
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -165,7 +180,7 @@ static void init_problem_data(void)
|
|
|
for (x = 0; x < nslicesx; x++)
|
|
|
{
|
|
|
starpu_matrix_data_register(&B_state[z][x], 0, (uintptr_t)B[z][x],
|
|
|
- BLOCKSIZEZ, BLOCKSIZEZ, BLOCKSIZEX, sizeof(float));
|
|
|
+ BLOCKSIZEZ, BLOCKSIZEZ, BLOCKSIZEX, sizeof(TYPE));
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -174,7 +189,7 @@ static void init_problem_data(void)
|
|
|
for (x = 0; x < nslicesx; x++)
|
|
|
{
|
|
|
starpu_matrix_data_register(&C_state[y][x], 0, (uintptr_t)C[y][x],
|
|
|
- BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEX, sizeof(float));
|
|
|
+ BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEX, sizeof(TYPE));
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -184,6 +199,8 @@ static void init_problem_data(void)
|
|
|
conf.n = BLOCKSIZEX;
|
|
|
#endif
|
|
|
|
|
|
+ fprintf(stderr, "block size : x %d y %d z %d\n", BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
|
|
|
+
|
|
|
display_memory_consumption();
|
|
|
}
|
|
|
|
|
@@ -191,6 +208,41 @@ static void cleanup_problem(void)
|
|
|
{
|
|
|
unsigned z, y, x;
|
|
|
|
|
|
+#ifdef CHECK_OUTPUT
|
|
|
+ TYPE maxerr = 0.0;
|
|
|
+ TYPE err;
|
|
|
+ fprintf(stderr, "Checking results ....");
|
|
|
+
|
|
|
+ for (y = 0; y < nslicesy; y++)
|
|
|
+ {
|
|
|
+ for (x = 0; x < nslicesx; x++)
|
|
|
+ {
|
|
|
+ for (z = 0; z < nslicesz; z++)
|
|
|
+ {
|
|
|
+ SGEMM("N", "N", BLOCKSIZEY, BLOCKSIZEX, BLOCKSIZEZ, -(TYPE)(niter), A[y][z], BLOCKSIZEY, B[z][x], BLOCKSIZEZ, 1.0f, C[y][x], BLOCKSIZEY);
|
|
|
+
|
|
|
+ }
|
|
|
+
|
|
|
+ /* make sure C - niter AB = 0 */
|
|
|
+ err = SASUM(BLOCKSIZEX*BLOCKSIZEY, C[y][x], 1);
|
|
|
+
|
|
|
+ if (err > BLOCKSIZEX*BLOCKSIZEY*niter*0.001)
|
|
|
+ fprintf(stderr, "\nerr = %f ( x = %d y = %d ) ... ", err/niter, x, y );
|
|
|
+
|
|
|
+ maxerr = STARPU_MAX(err, maxerr);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if (maxerr > BLOCKSIZEX*BLOCKSIZEY*niter*0.001)
|
|
|
+ {
|
|
|
+ fprintf(stderr, " maxerr = %f\n", maxerr/niter);
|
|
|
+ }
|
|
|
+ else {
|
|
|
+ fprintf(stderr, " OK\n");
|
|
|
+ }
|
|
|
+ fflush(stderr);
|
|
|
+#endif
|
|
|
+
|
|
|
for (y = 0; y < nslicesy; y++)
|
|
|
{
|
|
|
for (z = 0; z < nslicesz; z++)
|
|
@@ -226,20 +278,24 @@ struct cb2_s {
|
|
|
unsigned iter;
|
|
|
};
|
|
|
|
|
|
+
|
|
|
static starpu_codelet cl = {
|
|
|
- .cpu_func = cpu_mult,
|
|
|
+ .where = STARPU_CPU|STARPU_CUDA
|
|
|
+#ifdef SPU_FUNC_SGEMM
|
|
|
+ |STARPU_GORDON
|
|
|
+#endif
|
|
|
+ ,
|
|
|
+ .cpu_func = STARPU_GEMM(cpu_mult),
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
- .cuda_func = cublas_mult,
|
|
|
+ .cuda_func = STARPU_GEMM(cublas_mult),
|
|
|
#endif
|
|
|
#ifdef STARPU_USE_GORDON
|
|
|
/* .gordon_func will be set by load_elf_sgemm */
|
|
|
#endif
|
|
|
-
|
|
|
- .model = &sgemm_model,
|
|
|
- .where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
|
|
|
.nbuffers = 3
|
|
|
};
|
|
|
|
|
|
+
|
|
|
#ifdef STARPU_USE_GORDON
|
|
|
static const char *spu_func_sgemm_elf_file = "./gordon/func_sgemm_ibm.spuelf";
|
|
|
static unsigned spu_func_sgemm_elf_id;
|
|
@@ -252,26 +308,20 @@ static void load_elf_sgemm(void)
|
|
|
|
|
|
spu_func_sgemm_ibm_id = gordon_register_kernel(spu_func_sgemm_elf_id, "func_sgemm_ibm");
|
|
|
|
|
|
-
|
|
|
gordon_load_plugin_on_all_spu(spu_func_sgemm_elf_id);
|
|
|
gordon_load_kernel_on_all_spu(spu_func_sgemm_ibm_id);
|
|
|
|
|
|
cl.gordon_func = spu_func_sgemm_ibm_id;
|
|
|
}
|
|
|
-
|
|
|
#endif // STARPU_USE_GORDON
|
|
|
|
|
|
static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, unsigned iter)
|
|
|
{
|
|
|
+ /* A B[task] = C[task] */
|
|
|
struct starpu_task *task = starpu_task_create();
|
|
|
|
|
|
task->cl = &cl;
|
|
|
|
|
|
-#ifdef STARPU_USE_GORDON
|
|
|
- task->cl_arg = &conf;
|
|
|
- task->cl_arg_size = sizeof(struct ibm_sgemm_block_conf);
|
|
|
-#endif
|
|
|
-
|
|
|
task->use_tag = 1;
|
|
|
task->tag_id = TAG(z, y, x, iter);
|
|
|
|
|
@@ -282,6 +332,11 @@ static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, un
|
|
|
task->buffers[2].handle = C_state[y][x];
|
|
|
task->buffers[2].mode = STARPU_RW;
|
|
|
|
|
|
+#ifdef STARPU_USE_GORDON
|
|
|
+ task->cl_arg = &conf;
|
|
|
+ task->cl_arg_size = sizeof(struct ibm_sgemm_block_conf);
|
|
|
+#endif
|
|
|
+
|
|
|
return task;
|
|
|
}
|
|
|
|
|
@@ -355,12 +410,21 @@ static void launch_codelets(void)
|
|
|
|
|
|
srand(time(NULL));
|
|
|
|
|
|
- gettimeofday(&start, NULL);
|
|
|
+ /* should we use a single performance model for all archs and use an
|
|
|
+ * acceleration factor ? */
|
|
|
+ if (use_common_model) {
|
|
|
+ cl.model = &STARPU_GEMM(model_common);
|
|
|
+ }
|
|
|
+ else {
|
|
|
+ cl.model = &STARPU_GEMM(model);
|
|
|
+ }
|
|
|
|
|
|
for (taskx = 0; taskx < nslicesx; taskx++)
|
|
|
- for (tasky = 0; tasky < nslicesy; tasky++)
|
|
|
{
|
|
|
- submit_new_iter(taskx, tasky, 0);
|
|
|
+ for (tasky = 0; tasky < nslicesy; tasky++)
|
|
|
+ {
|
|
|
+ submit_new_iter(taskx, tasky, 0);
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -381,16 +445,16 @@ int main(__attribute__ ((unused)) int argc,
|
|
|
|
|
|
init_problem_data();
|
|
|
|
|
|
+ gettimeofday(&start, NULL);
|
|
|
+
|
|
|
launch_codelets();
|
|
|
|
|
|
starpu_task_wait_for_all();
|
|
|
|
|
|
gettimeofday(&end, NULL);
|
|
|
-
|
|
|
double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
|
|
|
display_stats(timing);
|
|
|
|
|
|
-
|
|
|
cleanup_problem();
|
|
|
|
|
|
starpu_helper_cublas_shutdown();
|