瀏覽代碼

Add 3D option to gemm example

Samuel Thibault 4 年之前
父節點
當前提交
a6fc65a485

+ 149 - 33
examples/mult/xgemm.c

@@ -47,6 +47,7 @@ static unsigned niter = 10;
 static unsigned nsleeps = 1;
 static unsigned nslicesx = 4;
 static unsigned nslicesy = 4;
+static unsigned nslicesz = 4;
 #if defined(STARPU_QUICK_CHECK) && !defined(STARPU_SIMGRID)
 static unsigned xdim = 256;
 static unsigned ydim = 256;
@@ -59,6 +60,7 @@ static unsigned zdim = 960*4;
 static unsigned check = 0;
 static unsigned bound = 0;
 static unsigned print_hostname = 0;
+static unsigned tiled = 0;
 
 static TYPE *A, *B, *C;
 static starpu_data_handle_t A_handle, B_handle, C_handle;
@@ -131,6 +133,8 @@ static void init_problem_data(void)
 
 static void partition_mult_data(void)
 {
+	unsigned x, y, z;
+
 	starpu_matrix_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)A,
 		ydim, ydim, zdim, sizeof(TYPE));
 	starpu_matrix_data_register(&B_handle, STARPU_MAIN_RAM, (uintptr_t)B,
@@ -148,19 +152,45 @@ static void partition_mult_data(void)
 	horiz.filter_func = starpu_matrix_filter_block;
 	horiz.nchildren = nslicesy;
 
-	starpu_data_partition(B_handle, &vert);
-	starpu_data_partition(A_handle, &horiz);
+	if (tiled)
+	{
+		struct starpu_data_filter vertA;
+		memset(&vertA, 0, sizeof(vertA));
+		vertA.filter_func = starpu_matrix_filter_vertical_block;
+		vertA.nchildren = nslicesz;
+
+		struct starpu_data_filter horizB;
+		memset(&horizB, 0, sizeof(horizB));
+		horizB.filter_func = starpu_matrix_filter_block;
+		horizB.nchildren = nslicesz;
+
+		starpu_data_map_filters(A_handle, 2, &vertA, &horiz);
+		starpu_data_map_filters(B_handle, 2, &vert, &horizB);
+		starpu_data_map_filters(C_handle, 2, &vert, &horiz);
+
+		for (y = 0; y < nslicesy; y++)
+		for (z = 0; z < nslicesz; z++)
+			starpu_data_set_coordinates(starpu_data_get_sub_data(A_handle, 2, z, y), 2, z, y);
+
+		for (x = 0; x < nslicesx; x++)
+		for (z = 0; z < nslicesz; z++)
+			starpu_data_set_coordinates(starpu_data_get_sub_data(B_handle, 2, x, z), 2, x, z);
+	}
+	else
+	{
+		starpu_data_partition(B_handle, &vert);
+		starpu_data_partition(A_handle, &horiz);
 
-	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
+		starpu_data_map_filters(C_handle, 2, &vert, &horiz);
+	}
 
-	unsigned x, y;
 	for (x = 0; x < nslicesx; x++)
 	for (y = 0; y < nslicesy; y++)
 		starpu_data_set_coordinates(starpu_data_get_sub_data(C_handle, 2, x, y), 2, x, y);
 }
 
 #ifdef STARPU_USE_CUDA
-static void cublas_mult(void *descr[], void *arg)
+static void cublas_mult(void *descr[], void *arg, const TYPE *beta)
 {
 	(void)arg;
 	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
@@ -179,13 +209,23 @@ static void cublas_mult(void *descr[], void *arg)
 			CUBLAS_OP_N, CUBLAS_OP_N,
 			nxC, nyC, nyA,
 			&p1, subA, ldA, subB, ldB,
-			&v0, subC, ldC);
+			beta, subC, ldC);
 	if (status != CUBLAS_STATUS_SUCCESS)
 		STARPU_CUBLAS_REPORT_ERROR(status);
 }
+
+static void cublas_gemm0(void *descr[], void *arg)
+{
+	cublas_mult(descr, arg, &v0);
+}
+
+static void cublas_gemm(void *descr[], void *arg)
+{
+	cublas_mult(descr, arg, &p1);
+}
 #endif
 
-void cpu_mult(void *descr[], void *arg)
+void cpu_mult(void *descr[], void *arg, TYPE beta)
 {
 	(void)arg;
 	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
@@ -205,7 +245,7 @@ void cpu_mult(void *descr[], void *arg)
 	if (worker_size == 1)
 	{
 		/* Sequential CPU task */
-		CPU_GEMM("N", "N", nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB, (TYPE)0.0, subC, ldC);
+		CPU_GEMM("N", "N", nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB, beta, subC, ldC);
 	}
 	else
 	{
@@ -220,24 +260,34 @@ void cpu_mult(void *descr[], void *arg)
 		TYPE *new_subB = &subB[block_size*rank];
 		TYPE *new_subC = &subC[block_size*rank];
 
-		CPU_GEMM("N", "N", nxC, new_nyC, nyA, (TYPE)1.0, subA, ldA, new_subB, ldB, (TYPE)0.0, new_subC, ldC);
+		CPU_GEMM("N", "N", nxC, new_nyC, nyA, (TYPE)1.0, subA, ldA, new_subB, ldB, beta, new_subC, ldC);
 	}
 }
 
+void cpu_gemm0(void *descr[], void *arg)
+{
+	cpu_mult(descr, arg, 0.);
+}
+
+void cpu_gemm(void *descr[], void *arg)
+{
+	cpu_mult(descr, arg, 1.);
+}
+
 static struct starpu_perfmodel starpu_gemm_model =
 {
 	.type = STARPU_HISTORY_BASED,
 	.symbol = STARPU_GEMM_STR(gemm)
 };
 
-static struct starpu_codelet cl =
+static struct starpu_codelet cl_gemm0 =
 {
 	.type = STARPU_SEQ, /* changed to STARPU_SPMD if -spmd is passed */
 	.max_parallelism = INT_MAX,
-	.cpu_funcs = {cpu_mult},
-	.cpu_funcs_name = {"cpu_mult"},
+	.cpu_funcs = {cpu_gemm0},
+	.cpu_funcs_name = {"cpu_gemm0"},
 #ifdef STARPU_USE_CUDA
-	.cuda_funcs = {cublas_mult},
+	.cuda_funcs = {cublas_gemm0},
 #elif defined(STARPU_SIMGRID)
 	.cuda_funcs = {(void*)1},
 #endif
@@ -247,16 +297,39 @@ static struct starpu_codelet cl =
 	.model = &starpu_gemm_model
 };
 
+static struct starpu_codelet cl_gemm =
+{
+	.type = STARPU_SEQ, /* changed to STARPU_SPMD if -spmd is passed */
+	.max_parallelism = INT_MAX,
+	.cpu_funcs = {cpu_gemm},
+	.cpu_funcs_name = {"cpu_gemm"},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {cublas_gemm},
+#elif defined(STARPU_SIMGRID)
+	.cuda_funcs = {(void*)1},
+#endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_RW},
+	.model = &starpu_gemm_model
+};
+
 static void parse_args(int argc, char **argv)
 {
 	int i;
 	for (i = 1; i < argc; i++)
 	{
-		if (strcmp(argv[i], "-nblocks") == 0)
+		if (strcmp(argv[i], "-3d") == 0)
+		{
+			tiled = 1;
+		}
+
+		else if (strcmp(argv[i], "-nblocks") == 0)
 		{
 			char *argptr;
 			nslicesx = strtol(argv[++i], &argptr, 10);
 			nslicesy = nslicesx;
+			nslicesz = nslicesx;
 		}
 
 		else if (strcmp(argv[i], "-nblocksx") == 0)
@@ -271,6 +344,12 @@ static void parse_args(int argc, char **argv)
 			nslicesy = strtol(argv[++i], &argptr, 10);
 		}
 
+		else if (strcmp(argv[i], "-nblocksz") == 0)
+		{
+			char *argptr;
+			nslicesz = strtol(argv[++i], &argptr, 10);
+		}
+
 		else if (strcmp(argv[i], "-x") == 0)
 		{
 			char *argptr;
@@ -330,12 +409,12 @@ static void parse_args(int argc, char **argv)
 
 		else if (strcmp(argv[i], "-spmd") == 0)
 		{
-			cl.type = STARPU_SPMD;
+			cl_gemm0.type = STARPU_SPMD;
 		}
 
 		else if (strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0)
 		{
-			fprintf(stderr,"Usage: %s [-nblocks n] [-nblocksx x] [-nblocksy y] [-x x] [-y y] [-xy n] [-z z] [-size size] [-iter iter] [-bound] [-check] [-spmd] [-hostname] [-nsleeps nsleeps]\n", argv[0]);
+			fprintf(stderr,"Usage: %s [-3d] [-nblocks n] [-nblocksx x] [-nblocksy y] [-nblocksz z] [-x x] [-y y] [-xy n] [-z z] [-size size] [-iter iter] [-bound] [-check] [-spmd] [-hostname] [-nsleeps nsleeps]\n", argv[0]);
 			fprintf(stderr,"Currently selected: %ux%u * %ux%u and %ux%u blocks, %u iterations, %u sleeps\n", zdim, ydim, xdim, zdim, nslicesx, nslicesy, niter, nsleeps);
 			exit(EXIT_SUCCESS);
 		}
@@ -385,32 +464,69 @@ int main(int argc, char **argv)
 
 		starpu_fxt_start_profiling();
 		start = starpu_timing_now();
+		starpu_sleep(1);
 
-		unsigned x, y, iter;
+		unsigned x, y, z, iter;
 		for (iter = 0; iter < niter; iter++)
 		{
-			for (x = 0; x < nslicesx; x++)
-			for (y = 0; y < nslicesy; y++)
+			if (tiled)
+			{
+				for (x = 0; x < nslicesx; x++)
+				for (y = 0; y < nslicesy; y++)
+				{
+					starpu_data_handle_t Ctile = starpu_data_get_sub_data(C_handle, 2, x, y);
+					for (z = 0; z < nslicesz; z++)
+					{
+						struct starpu_task *task = starpu_task_create();
+
+						if (z == 0)
+							task->cl = &cl_gemm0;
+						else
+							task->cl = &cl_gemm;
+
+						task->handles[0] = starpu_data_get_sub_data(A_handle, 2, z, y);
+						task->handles[1] = starpu_data_get_sub_data(B_handle, 2, x, z);
+						task->handles[2] = Ctile;
+
+						task->flops = 2ULL * (xdim/nslicesx) * (ydim/nslicesy) * (zdim/nslicesz);
+
+						ret = starpu_task_submit(task);
+						if (ret == -ENODEV)
+						{
+						     check = 0;
+						     ret = 77;
+						     goto enodev;
+						}
+						STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+					}
+					starpu_data_wont_use(Ctile);
+				}
+			}
+			else
 			{
-				struct starpu_task *task = starpu_task_create();
+				for (x = 0; x < nslicesx; x++)
+				for (y = 0; y < nslicesy; y++)
+				{
+					struct starpu_task *task = starpu_task_create();
 
-				task->cl = &cl;
+					task->cl = &cl_gemm0;
 
-				task->handles[0] = starpu_data_get_sub_data(A_handle, 1, y);
-				task->handles[1] = starpu_data_get_sub_data(B_handle, 1, x);
-				task->handles[2] = starpu_data_get_sub_data(C_handle, 2, x, y);
+					task->handles[0] = starpu_data_get_sub_data(A_handle, 1, y);
+					task->handles[1] = starpu_data_get_sub_data(B_handle, 1, x);
+					task->handles[2] = starpu_data_get_sub_data(C_handle, 2, x, y);
 
-				task->flops = 2ULL * (xdim/nslicesx) * (ydim/nslicesy) * zdim;
+					task->flops = 2ULL * (xdim/nslicesx) * (ydim/nslicesy) * zdim;
 
-				ret = starpu_task_submit(task);
-				if (ret == -ENODEV)
-				{
-				     check = 0;
-				     ret = 77;
-				     goto enodev;
+					ret = starpu_task_submit(task);
+					if (ret == -ENODEV)
+					{
+					     check = 0;
+					     ret = 77;
+					     goto enodev;
+					}
+					STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+					starpu_data_wont_use(starpu_data_get_sub_data(C_handle, 2, x, y));
 				}
-				STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
-				starpu_data_wont_use(starpu_data_get_sub_data(C_handle, 2, x, y));
 			}
 
 			starpu_task_wait_for_all();

+ 8 - 4
tools/perfmodels/sampling/codelets/45/starpu_sgemm_gemm.attila

@@ -29,7 +29,7 @@
 #####
 # Model for cpu0_impl0 (Comb3)
 # number of entries
-3
+4
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -38,6 +38,7 @@ nan            	nan            	nan
 0
 # hash		size		flops		mean (us)	dev (us)	sum		sum2		n
 492beed5	33177600       	7.077888e+09   	3.328725e+05   	1.185902e+04   	2.563119e+07   	8.542747e+12   	77
+24c84a50	11059200       	1.769472e+09   	8.321812e+04   	2.964755e+03   	6.407798e+06   	5.339217e+11   	77
 0b0b0ce8	3686400        	2.621440e+08   	1.421718e+04   	3.409134e+02   	9.098993e+05   	1.294364e+10   	64
 4220e23d	14745600       	2.097152e+09   	1.008105e+05   	2.361630e+03   	8.064841e+06   	8.134670e+11   	80
 
@@ -63,7 +64,7 @@ nan            	nan            	nan
 #####
 # Model for cuda0_impl0 (Comb0)
 # number of entries
-3
+4
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -72,6 +73,7 @@ nan            	nan            	nan
 0
 # hash		size		flops		mean (us)	dev (us)	sum		sum2		n
 492beed5	33177600       	7.077888e+09   	1.123499e+04   	6.785566e+01   	1.190909e+06   	1.338033e+10   	106
+24c84a50	11059200       	1.769472e+09   	2.808747e+03   	1.696392e+01   	2.977272e+05   	8.362706e+08   	106
 0b0b0ce8	3686400        	2.621440e+08   	6.738679e+02   	4.393713e+01   	6.873452e+04   	4.651489e+07   	102
 4220e23d	14745600       	2.097152e+09   	5.557425e+03   	3.241733e+02   	5.835297e+05   	3.253957e+09   	105
 
@@ -97,7 +99,7 @@ nan            	nan            	nan
 #####
 # Model for cuda1_impl0 (Comb2)
 # number of entries
-3
+4
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -106,6 +108,7 @@ nan            	nan            	nan
 0
 # hash		size		flops		mean (us)	dev (us)	sum		sum2		n
 492beed5	33177600       	7.077888e+09   	1.123077e+04   	9.504466e+01   	1.179231e+06   	1.324463e+10   	105
+24c84a50	11059200       	1.769472e+09   	2.807693e+03   	2.376116e+01   	2.948078e+05   	8.277894e+08   	105
 0b0b0ce8	3686400        	2.621440e+08   	6.672056e+02   	3.376608e+01   	6.805497e+04   	4.552295e+07   	102
 4220e23d	14745600       	2.097152e+09   	5.553764e+03   	3.500896e+02   	5.831453e+05   	3.251521e+09   	105
 
@@ -131,7 +134,7 @@ nan            	nan            	nan
 #####
 # Model for cuda2_impl0 (Comb1)
 # number of entries
-3
+4
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -140,6 +143,7 @@ nan            	nan            	nan
 0
 # hash		size		flops		mean (us)	dev (us)	sum		sum2		n
 492beed5	33177600       	7.077888e+09   	1.124174e+04   	2.629960e+01   	1.180383e+06   	1.326963e+10   	105
+24c84a50	11059200       	1.769472e+09   	2.810435e+03   	6.574900e+00   	2.950958e+05   	8.293519e+08   	105
 0b0b0ce8	3686400        	2.621440e+08   	6.002221e+02   	2.259043e+01   	6.242310e+04   	3.752080e+07   	104
 4220e23d	14745600       	2.097152e+09   	5.577722e+03   	1.615194e+02   	5.912385e+05   	3.300529e+09   	106
 

+ 18 - 9
tools/perfmodels/sampling/codelets/45/starpu_sgemm_gemm.idgraf

@@ -28,7 +28,7 @@
 #####
 # Model for cuda0_impl0 (Comb2)
 # number of entries
-3
+4
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -39,6 +39,7 @@ nan            	nan            	nan
 0b0b0ce8	3686400        	2.621440e+08   	6.801013e+02   	7.013561e+01   	4.760709e+04   	3.272198e+07   	70
 4220e23d	14745600       	2.097152e+09   	5.623635e+03   	5.419920e+02   	4.442672e+05   	2.521603e+09   	79
 492beed5	33177600       	7.077888e+09   	1.150361e+04   	5.884814e+02   	1.000814e+06   	1.154310e+10   	87
+24c84a50	11059200       	1.769472e+09   	2.875903e+03   	1.471204e+02   	2.502035e+05   	7.214438e+08   	87
 
 ####################
 # COMB_4
@@ -62,7 +63,7 @@ nan            	nan            	nan
 #####
 # Model for cuda1_impl0 (Comb4)
 # number of entries
-3
+4
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -73,6 +74,7 @@ nan            	nan            	nan
 0b0b0ce8	3686400        	2.621440e+08   	6.717051e+02   	6.137607e+01   	4.500424e+04   	3.048197e+07   	67
 4220e23d	14745600       	2.097152e+09   	5.648275e+03   	4.677390e+02   	4.575103e+05   	2.601865e+09   	81
 492beed5	33177600       	7.077888e+09   	1.157020e+04   	6.521027e+02   	1.018178e+06   	1.181795e+10   	88
+24c84a50	11059200       	1.769472e+09   	2.892550e+03   	1.630257e+02   	2.545445e+05   	7.386219e+08   	88
 
 ####################
 # COMB_6
@@ -96,7 +98,7 @@ nan            	nan            	nan
 #####
 # Model for cuda2_impl0 (Comb6)
 # number of entries
-3
+4
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -107,6 +109,7 @@ nan            	nan            	nan
 0b0b0ce8	3686400        	2.621440e+08   	6.265559e+02   	5.536840e+01   	4.824481e+04   	3.046412e+07   	77
 4220e23d	14745600       	2.097152e+09   	5.631203e+03   	4.767455e+02   	4.561275e+05   	2.586957e+09   	81
 492beed5	33177600       	7.077888e+09   	1.162826e+04   	6.757302e+02   	1.023286e+06   	1.193922e+10   	88
+24c84a50	11059200       	1.769472e+09   	2.907065e+03   	1.689325e+02   	2.558215e+05   	7.462012e+08   	88
 
 ####################
 # COMB_7
@@ -130,7 +133,7 @@ nan            	nan            	nan
 #####
 # Model for cuda3_impl0 (Comb7)
 # number of entries
-3
+4
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -141,6 +144,7 @@ nan            	nan            	nan
 0b0b0ce8	3686400        	2.621440e+08   	6.780899e+02   	4.241206e+01   	4.543202e+04   	3.092751e+07   	67
 4220e23d	14745600       	2.097152e+09   	5.857201e+03   	8.346836e+02   	4.744333e+05   	2.835284e+09   	81
 492beed5	33177600       	7.077888e+09   	1.150498e+04   	4.254093e+02   	9.894285e+05   	1.139892e+10   	86
+24c84a50	11059200       	1.769472e+09   	2.876245e+03   	1.063523e+02   	2.473571e+05   	7.124325e+08   	86
 
 ####################
 # COMB_0
@@ -164,7 +168,7 @@ nan            	nan            	nan
 #####
 # Model for cuda4_impl0 (Comb0)
 # number of entries
-3
+4
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -175,6 +179,7 @@ nan            	nan            	nan
 0b0b0ce8	3686400        	2.621440e+08   	6.759139e+02   	4.092799e+01   	4.190666e+04   	2.842915e+07   	62
 4220e23d	14745600       	2.097152e+09   	5.527477e+03   	2.733928e+02   	4.421982e+05   	2.450220e+09   	80
 492beed5	33177600       	7.077888e+09   	1.146770e+04   	1.768909e+02   	1.100899e+06   	1.262778e+10   	96
+24c84a50	11059200       	1.769472e+09   	2.866925e+03   	4.422272e+01   	2.752248e+05   	7.892362e+08   	96
 
 ####################
 # COMB_1
@@ -198,7 +203,7 @@ nan            	nan            	nan
 #####
 # Model for cuda5_impl0 (Comb1)
 # number of entries
-3
+4
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -209,6 +214,7 @@ nan            	nan            	nan
 0b0b0ce8	3686400        	2.621440e+08   	6.339465e+02   	7.125158e+01   	4.184047e+04   	2.685969e+07   	66
 4220e23d	14745600       	2.097152e+09   	5.624130e+03   	4.755864e+02   	4.668028e+05   	2.644133e+09   	83
 492beed5	33177600       	7.077888e+09   	1.149102e+04   	5.375188e+02   	1.114629e+06   	1.283625e+10   	97
+24c84a50	11059200       	1.769472e+09   	2.872755e+03   	1.343797e+02   	2.786572e+05   	8.022656e+08   	97
 
 ####################
 # COMB_3
@@ -232,7 +238,7 @@ nan            	nan            	nan
 #####
 # Model for cuda6_impl0 (Comb3)
 # number of entries
-3
+4
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -243,6 +249,7 @@ nan            	nan            	nan
 0b0b0ce8	3686400        	2.621440e+08   	6.389750e+02   	8.615382e+01   	4.728415e+04   	3.076266e+07   	74
 4220e23d	14745600       	2.097152e+09   	5.648331e+03   	5.220897e+02   	4.631632e+05   	2.638450e+09   	82
 492beed5	33177600       	7.077888e+09   	1.155069e+04   	5.660846e+02   	1.108866e+06   	1.283893e+10   	96
+24c84a50	11059200       	1.769472e+09   	2.887673e+03   	1.415212e+02   	2.772165e+05   	8.024331e+08   	96
 
 ####################
 # COMB_5
@@ -266,7 +273,7 @@ nan            	nan            	nan
 #####
 # Model for cuda7_impl0 (Comb5)
 # number of entries
-3
+4
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -277,6 +284,7 @@ nan            	nan            	nan
 0b0b0ce8	3686400        	2.621440e+08   	6.386625e+02   	8.094896e+01   	4.342905e+04   	2.818209e+07   	68
 4220e23d	14745600       	2.097152e+09   	5.638657e+03   	3.709019e+02   	4.454539e+05   	2.522630e+09   	79
 492beed5	33177600       	7.077888e+09   	1.144012e+04   	2.531108e+02   	1.109691e+06   	1.270122e+10   	97
+24c84a50	11059200       	1.769472e+09   	2.860030e+03   	6.327770e+01   	2.774228e+05   	7.938262e+08   	97
 
 ####################
 # COMB_8
@@ -300,7 +308,7 @@ nan            	nan            	nan
 #####
 # Model for cpu0_impl0 (Comb8)
 # number of entries
-3
+4
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -311,4 +319,5 @@ nan            	nan            	nan
 0b0b0ce8	3686400        	2.621440e+08   	1.414338e+04   	6.441210e+02   	3.535844e+05   	5.011251e+09   	25
 4220e23d	14745600       	2.097152e+09   	1.091117e+05   	2.701159e+03   	3.382462e+06   	3.692924e+11   	31
 492beed5	33177600       	7.077888e+09   	3.621356e+05   	7.764608e+03   	8.329119e+06   	3.017657e+12   	23
+24c84a50	11059200       	1.769472e+09   	9.053390e+04   	1.941152e+03   	2.082280e+06   	1.886036e+11   	23
 

+ 8 - 4
tools/perfmodels/sampling/codelets/45/starpu_sgemm_gemm.mirage

@@ -29,7 +29,7 @@
 #####
 # Model for cpu0_impl0 (Comb3)
 # number of entries
-3
+4
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -39,6 +39,7 @@ nan            	nan            	nan
 # hash		size		flops		mean (us)	dev (us)	sum		sum2		n
 0b0b0ce8	3686400        	2.621440e+08   	1.352609e+04   	3.616534e+02   	1.082087e+06   	1.464687e+10   	80
 492beed5	33177600       	7.077888e+09   	3.550396e+05   	8.949994e+03   	2.840317e+07   	1.009066e+13   	80
+24c84a50	11059200       	1.769472e+09   	8.875990e+04   	2.237499e+03   	7.100792e+06   	6.306662e+11   	80
 4220e23d	14745600       	2.097152e+09   	1.078112e+05   	1.983800e+03   	8.624897e+06   	9.301755e+11   	80
 
 ####################
@@ -63,7 +64,7 @@ nan            	nan            	nan
 #####
 # Model for cuda0_impl0 (Comb1)
 # number of entries
-3
+4
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -73,6 +74,7 @@ nan            	nan            	nan
 # hash		size		flops		mean (us)	dev (us)	sum		sum2		n
 0b0b0ce8	3686400        	2.621440e+08   	6.589631e+02   	8.406511e+00   	6.787320e+04   	4.473321e+07   	103
 492beed5	33177600       	7.077888e+09   	1.151398e+04   	9.050114e+01   	1.220482e+06   	1.405348e+10   	106
+24c84a50	11059200       	1.769472e+09   	2.878495e+03   	2.262529e+01   	3.051205e+05   	8.783425e+08   	106
 4220e23d	14745600       	2.097152e+09   	5.574713e+03   	3.353004e+02   	5.909196e+05   	3.306125e+09   	106
 
 ####################
@@ -97,7 +99,7 @@ nan            	nan            	nan
 #####
 # Model for cuda1_impl0 (Comb0)
 # number of entries
-3
+4
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -107,6 +109,7 @@ nan            	nan            	nan
 # hash		size		flops		mean (us)	dev (us)	sum		sum2		n
 0b0b0ce8	3686400        	2.621440e+08   	6.663664e+02   	8.616537e+01   	6.796937e+04   	4.604980e+07   	102
 492beed5	33177600       	7.077888e+09   	1.150036e+04   	8.404527e+01   	1.207538e+06   	1.388786e+10   	105
+24c84a50	11059200       	1.769472e+09   	2.875090e+03   	2.101132e+01   	3.018845e+05   	8.679912e+08   	105
 4220e23d	14745600       	2.097152e+09   	5.579034e+03   	3.672012e+02   	5.857985e+05   	3.282348e+09   	105
 
 ####################
@@ -131,7 +134,7 @@ nan            	nan            	nan
 #####
 # Model for cuda2_impl0 (Comb2)
 # number of entries
-3
+4
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -141,5 +144,6 @@ nan            	nan            	nan
 # hash		size		flops		mean (us)	dev (us)	sum		sum2		n
 0b0b0ce8	3686400        	2.621440e+08   	6.181769e+02   	5.174143e+01   	6.181769e+04   	3.848198e+07   	100
 492beed5	33177600       	7.077888e+09   	1.148096e+04   	7.289415e+01   	1.205501e+06   	1.384086e+10   	105
+24c84a50	11059200       	1.769472e+09   	2.870240e+03   	1.822354e+01   	3.013752e+05   	8.650538e+08   	105
 4220e23d	14745600       	2.097152e+09   	5.580581e+03   	3.970717e+02   	5.859610e+05   	3.286558e+09   	105
 

+ 10 - 5
tools/perfmodels/sampling/codelets/45/starpu_sgemm_gemm.sirocco

@@ -28,7 +28,7 @@
 #####
 # Model for cuda0_impl0 (Comb2)
 # number of entries
-4
+5
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -37,6 +37,7 @@ nan            	nan            	nan
 0
 # hash		size		flops		mean (us)	dev (us)	sum		sum2		n
 492beed5	33177600       	7.077888e+09   	2.745578e+03   	3.064191e+02   	6.616844e+05   	1.839335e+09   	241
+24c84a50	11059200       	1.769472e+09   	6.863945e+02   	7.660478e+01   	1.654211e+05   	1.149584e+08   	241
 0b0b0ce8	3686400        	2.621440e+08   	1.582927e+02   	3.333442e+01   	3.434951e+04   	5.678402e+06   	217
 4220e23d	14745600       	2.097152e+09   	8.206871e+02   	1.017181e+02   	1.148962e+05   	9.574235e+07   	140
 87a7dc42	74649600       	2.388787e+10   	9.813897e+03   	7.998509e+02   	1.570224e+06   	1.551237e+10   	160
@@ -63,7 +64,7 @@ nan            	nan            	nan
 #####
 # Model for cuda3_impl0 (Comb1)
 # number of entries
-4
+5
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -72,6 +73,7 @@ nan            	nan            	nan
 0
 # hash		size		flops		mean (us)	dev (us)	sum		sum2		n
 492beed5	33177600       	7.077888e+09   	2.686428e+03   	2.002215e+02   	6.716071e+05   	1.814247e+09   	250
+24c84a50	11059200       	1.769472e+09   	6.716070e+02   	5.005537e+01   	1.679018e+05   	1.133904e+08   	250
 0b0b0ce8	3686400        	2.621440e+08   	1.630480e+02   	3.438768e+01   	3.097912e+04   	5.275762e+06   	190
 4220e23d	14745600       	2.097152e+09   	8.448030e+02   	7.773742e+01   	2.433033e+05   	2.072837e+08   	288
 87a7dc42	74649600       	2.388787e+10   	9.873153e+03   	8.026227e+02   	1.579704e+06   	1.569974e+10   	160
@@ -98,7 +100,7 @@ nan            	nan            	nan
 #####
 # Model for cuda1_impl0 (Comb0)
 # number of entries
-4
+5
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -107,6 +109,7 @@ nan            	nan            	nan
 0
 # hash		size		flops		mean (us)	dev (us)	sum		sum2		n
 492beed5	33177600       	7.077888e+09   	2.791098e+03   	3.147711e+02   	6.503258e+05   	1.838209e+09   	233
+24c84a50	11059200       	1.769472e+09   	6.977745e+02   	7.869277e+01   	1.625815e+05   	1.148881e+08   	233
 0b0b0ce8	3686400        	2.621440e+08   	1.624855e+02   	3.298013e+01   	2.940987e+04   	4.975550e+06   	181
 4220e23d	14745600       	2.097152e+09   	8.152506e+02   	1.017614e+02   	1.173961e+05   	9.719839e+07   	144
 87a7dc42	74649600       	2.388787e+10   	1.001360e+04   	7.827579e+02   	1.582149e+06   	1.593981e+10   	158
@@ -133,7 +136,7 @@ nan            	nan            	nan
 #####
 # Model for cuda2_impl0 (Comb3)
 # number of entries
-4
+5
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -142,6 +145,7 @@ nan            	nan            	nan
 0
 # hash		size		flops		mean (us)	dev (us)	sum		sum2		n
 492beed5	33177600       	7.077888e+09   	2.754203e+03   	2.682327e+02   	6.830422e+05   	1.899080e+09   	248
+24c84a50	11059200       	1.769472e+09   	6.885507e+02   	6.705818e+01   	1.707605e+05   	1.186925e+08   	248
 0b0b0ce8	3686400        	2.621440e+08   	1.622246e+02   	3.553894e+01   	3.714942e+04   	6.315779e+06   	229
 4220e23d	14745600       	2.097152e+09   	8.611626e+02   	9.290485e+01   	2.411255e+05   	2.100651e+08   	280
 87a7dc42	74649600       	2.388787e+10   	9.935915e+03   	7.366769e+02   	1.569875e+06   	1.568389e+10   	158
@@ -168,7 +172,7 @@ nan            	nan            	nan
 #####
 # Model for cpu0_impl0 (Comb4)
 # number of entries
-4
+5
 # sumlnx	sumlnx2		sumlny		sumlnxlny	alpha		beta		n	minx		maxx
 0.000000e+00   	0.000000e+00   	0.000000e+00   	0.000000e+00   	nan            	nan            	0	0              	0              
 # a		b		c
@@ -177,6 +181,7 @@ nan            	nan            	nan
 0
 # hash		size		flops		mean (us)	dev (us)	sum		sum2		n
 492beed5	33177600       	7.077888e+09   	1.712078e+05   	4.163047e+04   	2.773567e+07   	5.029326e+12   	162
+24c84a50	11059200       	1.769472e+09   	4.280195e+04   	1.040762e+04   	6.933918e+06   	3.143329e+11   	162
 0b0b0ce8	3686400        	2.621440e+08   	6.441655e+03   	1.152866e+03   	3.220827e+05   	2.141201e+09   	50
 4220e23d	14745600       	2.097152e+09   	4.927734e+04   	1.166029e+04   	5.913281e+06   	3.077063e+11   	120
 87a7dc42	74649600       	2.388787e+10   	5.091210e+05   	1.022002e+05   	6.974957e+07   	3.694192e+13   	137