Selaa lähdekoodia

Use a coherent m,n ordering in all cholesky codes.

Avoid i,j which is ambiguous.
Samuel Thibault 5 vuotta sitten
vanhempi
commit
49da257213

+ 45 - 40
examples/cholesky/cholesky_compil.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2013,2015                           Inria
- * Copyright (C) 2009-2017,2019                           Université de Bordeaux
+ * Copyright (C) 2009-2017,2019-2020                           Université de Bordeaux
  * Copyright (C) 2010-2013,2015-2017                      CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2010                                     Mehdi Juhoor
@@ -24,6 +24,9 @@
  * compiler-side optimizations.
  */
 
+/* Note: this is using fortran ordering, i.e. column-major ordering, i.e.
+ * elements with consecutive row number are consecutive in memory */
+
 #include "cholesky.h"
 #include "../sched_ctx_utils/sched_ctx_utils.h"
 #include <math.h>
@@ -50,8 +53,8 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
 	unsigned long nelems = starpu_matrix_get_nx(dataA);
 	unsigned long nn = nelems/nblocks;
-	int N = nblocks;
 	int M = nblocks;
+	int N = nblocks;
 
 	int lambda_b = starpu_get_env_float_default("CHOLESKY_LAMBDA_B", nblocks);
 	int lambda_o_u = starpu_get_env_float_default("CHOLESKY_LAMBDA_O_U", 0);
@@ -70,7 +73,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 #define ceild(n,d)  ceil(((double)(n))/((double)(d)))
 #define floord(n,d) floor(((double)(n))/((double)(d)))
 
-#define A(i,j) starpu_data_get_sub_data(dataA, 2, j, i)
+#define A(i,j) starpu_data_get_sub_data(dataA, 2, i, j)
 
 #define _POTRF(cl, A, prio, name) do { \
 		int ret = starpu_task_insert(cl, \
@@ -204,31 +207,34 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 static int cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 {
 	starpu_data_handle_t dataA;
-	unsigned x, y;
+	unsigned m, n;
 
 	/* monitor and partition the A matrix into blocks :
-	 * one block is now determined by 2 unsigned (i,j) */
+	 * one block is now determined by 2 unsigned (m,n) */
 	starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(float));
 
+	/* Split into blocks of complete rows first */
 	struct starpu_data_filter f =
 	{
-		.filter_func = starpu_matrix_filter_vertical_block,
+		.filter_func = starpu_matrix_filter_block,
 		.nchildren = nblocks
 	};
 
+	/* Then split rows into tiles */
 	struct starpu_data_filter f2 =
 	{
-		.filter_func = starpu_matrix_filter_block,
+		/* Note: here "vertical" is for row-major, we are here using column-major. */
+		.filter_func = starpu_matrix_filter_vertical_block,
 		.nchildren = nblocks
 	};
 
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 
-	for (x = 0; x < nblocks; x++)
-		for (y = 0; y < nblocks; y++)
+	for (m = 0; m < nblocks; m++)
+		for (n = 0; n < nblocks; n++)
 		{
-			starpu_data_handle_t data = starpu_data_get_sub_data(dataA, 2, x, y);
-			starpu_data_set_coordinates(data, 2, x, y);
+			starpu_data_handle_t data = starpu_data_get_sub_data(dataA, 2, m, n);
+			starpu_data_set_coordinates(data, 2, m, n);
 		}
 
 	int ret = _cholesky(dataA, nblocks);
@@ -244,14 +250,14 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 	float *mat = NULL;
 
 #ifndef STARPU_SIMGRID
-	unsigned i,j;
+	unsigned m,n;
 	starpu_malloc_flags((void **)&mat, (size_t)size*size*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
-	for (i = 0; i < size; i++)
+	for (n = 0; n < size; n++)
 	{
-		for (j = 0; j < size; j++)
+		for (m = 0; m < size; m++)
 		{
-			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
+			mat[m +n*size] = (1.0f/(1.0f+m+n)) + ((m == n)?1.0f*size:0.0f);
+			/* mat[m +n*size] = ((m == n)?1.0f*size:0.0f); */
 		}
 	}
 
@@ -259,13 +265,13 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 #ifdef PRINT_OUTPUT
 	FPRINTF(stdout, "Input :\n");
 
-	for (j = 0; j < size; j++)
+	for (m = 0; m < size; m++)
 	{
-		for (i = 0; i < size; i++)
+		for (n = 0; n < size; n++)
 		{
-			if (i <= j)
+			if (n <= m)
 			{
-				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+				FPRINTF(stdout, "%2.2f\t", mat[m +n*size]);
 			}
 			else
 			{
@@ -282,18 +288,17 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 #ifndef STARPU_SIMGRID
 #ifdef PRINT_OUTPUT
 	FPRINTF(stdout, "Results :\n");
-	for (j = 0; j < size; j++)
+	for (m = 0; m < size; m++)
 	{
-		for (i = 0; i < size; i++)
+		for (n = 0; n < size; n++)
 		{
-			if (i <= j)
+			if (n <= m)
 			{
-				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+				FPRINTF(stdout, "%2.2f\t", mat[m +n*size]);
 			}
 			else
 			{
 				FPRINTF(stdout, ".\t");
-				mat[j+i*size] = 0.0f; /* debug */
 			}
 		}
 		FPRINTF(stdout, "\n");
@@ -303,13 +308,13 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 	if (check_p)
 	{
 		FPRINTF(stderr, "compute explicit LLt ...\n");
-		for (j = 0; j < size; j++)
+		for (m = 0; m < size; m++)
 		{
-			for (i = 0; i < size; i++)
+			for (n = 0; n < size; n++)
 			{
-				if (i > j)
+				if (n > m)
 				{
-					mat[j+i*size] = 0.0f; /* debug */
+					mat[m+n*size] = 0.0f; /* debug */
 				}
 			}
 		}
@@ -321,13 +326,13 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 
 		FPRINTF(stderr, "comparing results ...\n");
 #ifdef PRINT_OUTPUT
-		for (j = 0; j < size; j++)
+		for (m = 0; m < size; m++)
 		{
-			for (i = 0; i < size; i++)
+			for (n = 0; n < size; n++)
 			{
-				if (i <= j)
+				if (n <= m)
 				{
-					FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
+					FPRINTF(stdout, "%2.2f\t", test_mat[m +n*size]);
 				}
 				else
 				{
@@ -338,17 +343,17 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 		}
 #endif
 
-		for (j = 0; j < size; j++)
+		for (m = 0; m < size; m++)
 		{
-			for (i = 0; i < size; i++)
+			for (n = 0; n < size; n++)
 			{
-				if (i <= j)
+				if (n <= m)
 				{
-	                                float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-	                                float err = fabsf(test_mat[j +i*size] - orig) / orig;
-	                                if (err > 0.00001)
+	                                float orig = (1.0f/(1.0f+m+n)) + ((m == n)?1.0f*size:0.0f);
+	                                float err = fabsf(test_mat[m +n*size] - orig) / orig;
+	                                if (err > 0.0001)
 					{
-	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.6f != %2.6f (err %2.6f)\n", i, j, test_mat[j +i*size], orig, err);
+	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.6f != %2.6f (err %2.6f)\n", m, n, test_mat[m +n*size], orig, err);
 	                                        assert(0);
 	                                }
 	                        }

+ 115 - 73
examples/cholesky/cholesky_grain_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2008-2017                                Université de Bordeaux
+ * Copyright (C) 2008-2017,2020                           Université de Bordeaux
  * Copyright (C) 2012,2013                                Inria
  * Copyright (C) 2010-2013,2015,2017                      CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
@@ -28,6 +28,9 @@
  * remainder of the matrix with a smaller granularity.
  */
 
+/* Note: this is using fortran ordering, i.e. column-major ordering, i.e.
+ * elements with consecutive row number are consecutive in memory */
+
 #include "cholesky.h"
 
 #if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_MAGMA)
@@ -64,7 +67,8 @@ static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned
 	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
 
 	/* this is an important task */
-	task->priority = STARPU_MAX_PRIO;
+	if (!noprio_p)
+		task->priority = STARPU_MAX_PRIO;
 
 	/* enforce dependencies ... */
 	if (k > 0)
@@ -78,19 +82,19 @@ static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned
 	return task;
 }
 
-static int create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j, unsigned reclevel)
+static int create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned m, unsigned reclevel)
 {
 	int ret;
 
-	struct starpu_task *task = create_task(TAG21_AUX(k, j, reclevel));
+	struct starpu_task *task = create_task(TAG21_AUX(k, m, reclevel));
 
 	task->cl = &cl21;
 
 	/* which sub-data is manipulated ? */
 	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
-	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
+	task->handles[1] = starpu_data_get_sub_data(dataA, 2, m, k);
 
-	if (j == k+1)
+	if (!noprio_p && (m == k+1))
 	{
 		task->priority = STARPU_MAX_PRIO;
 	}
@@ -98,37 +102,37 @@ static int create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j, un
 	/* enforce dependencies ... */
 	if (k > 0)
 	{
-		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 2, TAG11_AUX(k, reclevel), TAG22_AUX(k-1, k, j, reclevel));
+		starpu_tag_declare_deps(TAG21_AUX(k, m, reclevel), 2, TAG11_AUX(k, reclevel), TAG22_AUX(k-1, m, k, reclevel));
 	}
 	else
 	{
-		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 1, TAG11_AUX(k, reclevel));
+		starpu_tag_declare_deps(TAG21_AUX(k, m, reclevel), 1, TAG11_AUX(k, reclevel));
 	}
 
-	int n = starpu_matrix_get_nx(task->handles[0]);
-	task->flops = FLOPS_STRSM(n, n);
+	int nx = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_STRSM(nx, nx);
 
 	ret = starpu_task_submit(task);
 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	return ret;
 }
 
-static int create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, unsigned j, unsigned reclevel)
+static int create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned m, unsigned n, unsigned reclevel)
 {
 	int ret;
 
-/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22_AUX(k,i,j)); */
+/*	FPRINTF(stdout, "task 22 k,n,m = %d,%d,%d TAG = %llx\nx", k,m,n, TAG22_AUX(k,m,n)); */
 
-	struct starpu_task *task = create_task(TAG22_AUX(k, i, j, reclevel));
+	struct starpu_task *task = create_task(TAG22_AUX(k, m, n, reclevel));
 
 	task->cl = &cl22;
 
 	/* which sub-data is manipulated ? */
-	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, i);
-	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
-	task->handles[2] = starpu_data_get_sub_data(dataA, 2, i, j);
+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, n, k);
+	task->handles[1] = starpu_data_get_sub_data(dataA, 2, m, k);
+	task->handles[2] = starpu_data_get_sub_data(dataA, 2, m, n);
 
-	if ( (i == k + 1) && (j == k +1) )
+	if ( (n == k + 1) && (m == k +1) )
 	{
 		task->priority = STARPU_MAX_PRIO;
 	}
@@ -136,15 +140,15 @@ static int create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, un
 	/* enforce dependencies ... */
 	if (k > 0)
 	{
-		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 3, TAG22_AUX(k-1, i, j, reclevel), TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
+		starpu_tag_declare_deps(TAG22_AUX(k, m, n, reclevel), 3, TAG22_AUX(k-1, m, n, reclevel), TAG21_AUX(k, n, reclevel), TAG21_AUX(k, m, reclevel));
 	}
 	else
 	{
-		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 2, TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
+		starpu_tag_declare_deps(TAG22_AUX(k, m, n, reclevel), 2, TAG21_AUX(k, n, reclevel), TAG21_AUX(k, m, reclevel));
 	}
 
-	int n = starpu_matrix_get_nx(task->handles[0]);
-	task->flops = FLOPS_SGEMM(n, n, n);
+	int nx = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_SGEMM(nx, nx, nx);
 
 	ret = starpu_task_submit(task);
 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
@@ -166,7 +170,7 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 	struct starpu_task *entry_task = NULL;
 
 	/* create all the DAG nodes */
-	unsigned i,j,k;
+	unsigned k, m, n;
 
 	starpu_data_handle_t dataA;
 
@@ -176,15 +180,18 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 
 	starpu_data_set_sequential_consistency_flag(dataA, 0);
 
+	/* Split into blocks of complete rows first */
 	struct starpu_data_filter f =
 	{
-		.filter_func = starpu_matrix_filter_vertical_block,
+		.filter_func = starpu_matrix_filter_block,
 		.nchildren = nblocks
 	};
 
+	/* Then split rows into tiles */
 	struct starpu_data_filter f2 =
 	{
-		.filter_func = starpu_matrix_filter_block,
+		/* Note: here "vertical" is for row-major, we are here using column-major. */
+		.filter_func = starpu_matrix_filter_vertical_block,
 		.nchildren = nblocks
 	};
 
@@ -206,16 +213,16 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		}
 
-		for (j = k+1; j<nblocks; j++)
+		for (m = k+1; m<nblocks; m++)
 		{
-		     	ret = create_task_21(dataA, k, j, reclevel);
+		     	ret = create_task_21(dataA, k, m, reclevel);
 			if (ret == -ENODEV) return 77;
 
-			for (i = k+1; i<nblocks; i++)
+			for (n = k+1; n<nblocks; n++)
 			{
-				if (i <= j)
+				if (n <= m)
 				{
-				     ret = create_task_22(dataA, k, i, j, reclevel);
+				     ret = create_task_22(dataA, k, m, n, reclevel);
 				     if (ret == -ENODEV) return 77;
 				}
 			}
@@ -248,11 +255,11 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 		STARPU_ASSERT(tag_array);
 
 		unsigned ind = 0;
-		for (i = nbigblocks; i < nblocks; i++)
-		for (j = nbigblocks; j < nblocks; j++)
+		for (n = nbigblocks; n < nblocks; n++)
+		for (m = nbigblocks; m < nblocks; m++)
 		{
-			if (i <= j)
-				tag_array[ind++] = TAG22_AUX(nbigblocks - 1, i, j, reclevel);
+			if (n <= m)
+				tag_array[ind++] = TAG22_AUX(nbigblocks - 1, m, n, reclevel);
 		}
 
 		starpu_tag_wait_array(ind, tag_array);
@@ -268,7 +275,7 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 	}
 }
 
-static void initialize_system(int argc, char **argv, float **A, unsigned pinned)
+static int initialize_system(int argc, char **argv, float **A, unsigned pinned)
 {
 	int ret;
 	int flags = STARPU_MALLOC_SIMULATION_FOLDED;
@@ -279,7 +286,7 @@ static void initialize_system(int argc, char **argv, float **A, unsigned pinned)
 
 	ret = starpu_init(NULL);
 	if (ret == -ENODEV)
-		exit(77);
+		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	init_sizes();
@@ -301,6 +308,8 @@ static void initialize_system(int argc, char **argv, float **A, unsigned pinned)
 	if (pinned)
 		flags |= STARPU_MALLOC_PINNED;
 	starpu_malloc_flags((void **)A, size_p*size_p*sizeof(float), flags);
+
+	return 0;
 }
 
 int cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks)
@@ -344,34 +353,33 @@ int main(int argc, char **argv)
 	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
 	 * */
 
-     	int ret;
-
 	float *mat = NULL;
-	initialize_system(argc, argv, &mat, pinned_p);
+	int ret = initialize_system(argc, argv, &mat, pinned_p);
+	if (ret) return ret;
 
 #ifndef STARPU_SIMGRID
-	unsigned i,j;
-	for (i = 0; i < size_p; i++)
+	unsigned m,n;
+
+	for (n = 0; n < size_p; n++)
 	{
-		for (j = 0; j < size_p; j++)
+		for (m = 0; m < size_p; m++)
 		{
-			mat[j +i*size_p] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size_p:0.0f);
-			/* mat[j +i*size_p] = ((i == j)?1.0f*size_p:0.0f); */
+			mat[m +n*size_p] = (1.0f/(1.0f+n+m)) + ((n == m)?1.0f*size_p:0.0f);
+			/* mat[m +n*size_p] = ((n == m)?1.0f*size_p:0.0f); */
 		}
 	}
-#endif
 
-
-#ifdef CHECK_OUTPUT
+/* #define PRINT_OUTPUT */
+#ifdef PRINT_OUTPUT
 	FPRINTF(stdout, "Input :\n");
 
-	for (j = 0; j < size_p; j++)
+	for (m = 0; m < size_p; m++)
 	{
-		for (i = 0; i < size_p; i++)
+		for (n = 0; n < size_p; n++)
 		{
-			if (i <= j)
+			if (n <= m)
 			{
-				FPRINTF(stdout, "%2.2f\t", mat[j +i*size_p]);
+				FPRINTF(stdout, "%2.2f\t", mat[m +n*size_p]);
 			}
 			else
 			{
@@ -381,55 +389,89 @@ int main(int argc, char **argv)
 		FPRINTF(stdout, "\n");
 	}
 #endif
+#endif
 
 	ret = cholesky_grain(mat, size_p, size_p, nblocks_p, nbigblocks_p);
 
-#ifdef CHECK_OUTPUT
+#ifndef STARPU_SIMGRID
+#ifdef PRINT_OUTPUT
 	FPRINTF(stdout, "Results :\n");
 
-	for (j = 0; j < size_p; j++)
+	for (m = 0; m < size_p; m++)
 	{
-		for (i = 0; i < size_p; i++)
+		for (n = 0; n < size_p; n++)
 		{
-			if (i <= j)
+			if (n <= m)
 			{
-				FPRINTF(stdout, "%2.2f\t", mat[j +i*size_p]);
+				FPRINTF(stdout, "%2.2f\t", mat[m +n*size_p]);
 			}
 			else
 			{
 				FPRINTF(stdout, ".\t");
-				mat[j+i*size_p] = 0.0f; /* debug */
 			}
 		}
 		FPRINTF(stdout, "\n");
 	}
+#endif
 
-	FPRINTF(stderr, "compute explicit LLt ...\n");
-	float *test_mat = malloc(size_p*size_p*sizeof(float));
-	STARPU_ASSERT(test_mat);
-
-	STARPU_SSYRK("L", "N", size_p, size_p, 1.0f,
-		     mat, size_p, 0.0f, test_mat, size_p);
-
-	FPRINTF(stderr, "comparing results ...\n");
-	for (j = 0; j < size_p; j++)
+	if (check_p)
 	{
-		for (i = 0; i < size_p; i++)
+		FPRINTF(stderr, "compute explicit LLt ...\n");
+		for (m = 0; m < size_p; m++)
 		{
-			if (i <= j)
+			for (n = 0; n < size_p; n++)
 			{
-                                FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size_p]);
+				if (n > m)
+				{
+					mat[m+n*size_p] = 0.0f; /* debug */
+				}
 			}
-			else
+		}
+		float *test_mat = malloc(size_p*size_p*sizeof(float));
+		STARPU_ASSERT(test_mat);
+
+		STARPU_SSYRK("L", "N", size_p, size_p, 1.0f,
+			     mat, size_p, 0.0f, test_mat, size_p);
+
+		FPRINTF(stderr, "comparing results ...\n");
+#ifdef PRINT_OUTPUT
+		for (m = 0; m < size_p; m++)
+		{
+			for (n = 0; n < size_p; n++)
 			{
-				FPRINTF(stdout, ".\t");
+				if (n <= m)
+				{
+					FPRINTF(stdout, "%2.2f\t", test_mat[m +n*size_p]);
+				}
+				else
+				{
+					FPRINTF(stdout, ".\t");
+				}
 			}
+			FPRINTF(stdout, "\n");
 		}
-		FPRINTF(stdout, "\n");
+#endif
+
+		for (m = 0; m < size_p; m++)
+		{
+			for (n = 0; n < size_p; n++)
+			{
+				if (n <= m)
+				{
+	                                float orig = (1.0f/(1.0f+m+n)) + ((m == n)?1.0f*size_p:0.0f);
+	                                float err = fabsf(test_mat[m +n*size_p] - orig) / orig;
+	                                if (err > 0.0001)
+					{
+	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.6f != %2.6f (err %2.6f)\n", m, n, test_mat[m +n*size_p], orig, err);
+	                                        assert(0);
+	                                }
+	                        }
+			}
+	        }
+		free(test_mat);
 	}
-	free(test_mat);
 #endif
 
 	shutdown_system(&mat, size_p, pinned_p);
-	return ret;
+	return 0;
 }

+ 63 - 58
examples/cholesky/cholesky_implicit.c

@@ -23,6 +23,9 @@
  * The whole algorithm thus appears clearly in the task submission loop in _cholesky().
  */
 
+/* Note: this is using fortran ordering, i.e. column-major ordering, i.e.
+ * elements with consecutive row number are consecutive in memory */
+
 #include "cholesky.h"
 #include "../sched_ctx_utils/sched_ctx_utils.h"
 
@@ -46,9 +49,9 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 	double start;
 	double end;
 
-	unsigned i,j,k;
-	unsigned long n = starpu_matrix_get_nx(dataA);
-	unsigned long nn = n/nblocks;
+	unsigned k,m,n;
+	unsigned long nx = starpu_matrix_get_nx(dataA);
+	unsigned long nn = nx/nblocks;
 
 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
 
@@ -75,45 +78,45 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 		if (ret == -ENODEV) return 77;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
-		for (j = k+1; j<nblocks; j++)
+		for (m = k+1; m<nblocks; m++)
 		{
-                        starpu_data_handle_t sdatakj = starpu_data_get_sub_data(dataA, 2, k, j);
+                        starpu_data_handle_t sdatamk = starpu_data_get_sub_data(dataA, 2, m, k);
 
                         ret = starpu_task_insert(&cl21,
-						 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - j) : (j == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 						 STARPU_R, sdatakk,
-						 STARPU_RW, sdatakj,
+						 STARPU_RW, sdatamk,
 						 STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
-						 STARPU_TAG_ONLY, TAG21(k,j),
+						 STARPU_TAG_ONLY, TAG21(m,k),
 						 0);
 			if (ret == -ENODEV) return 77;
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 		}
 		starpu_data_wont_use(sdatakk);
 
-		for (j = k+1; j<nblocks; j++)
+		for (m = k+1; m<nblocks; m++)
 		{
-                        starpu_data_handle_t sdatakj = starpu_data_get_sub_data(dataA, 2, k, j);
-			for (i = k+1; i<nblocks; i++)
+                        starpu_data_handle_t sdatamk = starpu_data_get_sub_data(dataA, 2, m, k);
+			for (n = k+1; n<nblocks; n++)
 			{
-				if (i <= j)
+				if (n <= m)
                                 {
-					starpu_data_handle_t sdataki = starpu_data_get_sub_data(dataA, 2, k, i);
-					starpu_data_handle_t sdataij = starpu_data_get_sub_data(dataA, 2, i, j);
+					starpu_data_handle_t sdatank = starpu_data_get_sub_data(dataA, 2, n, k);
+					starpu_data_handle_t sdatamn = starpu_data_get_sub_data(dataA, 2, m, n);
 
 					ret = starpu_task_insert(&cl22,
-								 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - j - i) : ((i == k+1) && (j == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
-								 STARPU_R, sdataki,
-								 STARPU_R, sdatakj,
-								 cl22.modes[2], sdataij,
+								 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+								 STARPU_R, sdatamk,
+								 STARPU_R, sdatank,
+								 cl22.modes[2], sdatamn,
 								 STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
-								 STARPU_TAG_ONLY, TAG22(k,i,j),
+								 STARPU_TAG_ONLY, TAG22(k,m,n),
 								 0);
 					if (ret == -ENODEV) return 77;
 					STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
                                 }
 			}
-			starpu_data_wont_use(sdatakj);
+			starpu_data_wont_use(sdatamk);
 		}
 		starpu_iteration_pop();
 	}
@@ -128,7 +131,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
 	double timing = end - start;
 
-	double flop = FLOPS_SPOTRF(n);
+	double flop = FLOPS_SPOTRF(nx);
 
 	if(with_ctxs_p || with_noctxs_p || chole1_p || chole2_p)
 		update_sched_ctx_timing_results((flop/timing/1000.0f), (timing/1000000.0f));
@@ -139,7 +142,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 			PRINTF("\tTms\tTGFlops");
 		PRINTF("\n");
 
-		PRINTF("%lu\t%.0f\t%.1f", n, timing/1000, (flop/timing/1000.0f));
+		PRINTF("%lu\t%.0f\t%.1f", nx, timing/1000, (flop/timing/1000.0f));
 		if (bound_lp_p)
 		{
 			FILE *f = fopen("cholesky.lp", "w");
@@ -166,31 +169,34 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 static int cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 {
 	starpu_data_handle_t dataA;
-	unsigned x, y;
+	unsigned m, n;
 
 	/* monitor and partition the A matrix into blocks :
-	 * one block is now determined by 2 unsigned (i,j) */
+	 * one block is now determined by 2 unsigned (m,n) */
 	starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(float));
 
+	/* Split into blocks of complete rows first */
 	struct starpu_data_filter f =
 	{
-		.filter_func = starpu_matrix_filter_vertical_block,
+		.filter_func = starpu_matrix_filter_block,
 		.nchildren = nblocks
 	};
 
+	/* Then split rows into tiles */
 	struct starpu_data_filter f2 =
 	{
-		.filter_func = starpu_matrix_filter_block,
+		/* Note: here "vertical" is for row-major, we are here using column-major. */
+		.filter_func = starpu_matrix_filter_vertical_block,
 		.nchildren = nblocks
 	};
 
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 
-	for (x = 0; x < nblocks; x++)
-		for (y = 0; y < nblocks; y++)
+	for (m = 0; m < nblocks; m++)
+		for (n = 0; n < nblocks; n++)
 		{
-			starpu_data_handle_t data = starpu_data_get_sub_data(dataA, 2, x, y);
-			starpu_data_set_coordinates(data, 2, x, y);
+			starpu_data_handle_t data = starpu_data_get_sub_data(dataA, 2, m, n);
+			starpu_data_set_coordinates(data, 2, m, n);
 		}
 
 	int ret = _cholesky(dataA, nblocks);
@@ -206,14 +212,14 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 	float *mat = NULL;
 
 #ifndef STARPU_SIMGRID
-	unsigned i,j;
+	unsigned m,n;
 	starpu_malloc_flags((void **)&mat, (size_t)size*size*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
-	for (i = 0; i < size; i++)
+	for (n = 0; n < size; n++)
 	{
-		for (j = 0; j < size; j++)
+		for (m = 0; m < size; m++)
 		{
-			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
+			mat[m +n*size] = (1.0f/(1.0f+m+n)) + ((m == n)?1.0f*size:0.0f);
+			/* mat[m +n*size] = ((m == n)?1.0f*size:0.0f); */
 		}
 	}
 
@@ -221,13 +227,13 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 #ifdef PRINT_OUTPUT
 	FPRINTF(stdout, "Input :\n");
 
-	for (j = 0; j < size; j++)
+	for (m = 0; m < size; m++)
 	{
-		for (i = 0; i < size; i++)
+		for (n = 0; n < size; n++)
 		{
-			if (i <= j)
+			if (n <= m)
 			{
-				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+				FPRINTF(stdout, "%2.2f\t", mat[m +n*size]);
 			}
 			else
 			{
@@ -244,18 +250,17 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 #ifndef STARPU_SIMGRID
 #ifdef PRINT_OUTPUT
 	FPRINTF(stdout, "Results :\n");
-	for (j = 0; j < size; j++)
+	for (m = 0; m < size; m++)
 	{
-		for (i = 0; i < size; i++)
+		for (n = 0; n < size; n++)
 		{
-			if (i <= j)
+			if (n <= m)
 			{
-				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+				FPRINTF(stdout, "%2.2f\t", mat[m +n*size]);
 			}
 			else
 			{
 				FPRINTF(stdout, ".\t");
-				mat[j+i*size] = 0.0f; /* debug */
 			}
 		}
 		FPRINTF(stdout, "\n");
@@ -265,13 +270,13 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 	if (check_p)
 	{
 		FPRINTF(stderr, "compute explicit LLt ...\n");
-		for (j = 0; j < size; j++)
+		for (m = 0; m < size; m++)
 		{
-			for (i = 0; i < size; i++)
+			for (n = 0; n < size; n++)
 			{
-				if (i > j)
+				if (n > m)
 				{
-					mat[j+i*size] = 0.0f; /* debug */
+					mat[m+n*size] = 0.0f; /* debug */
 				}
 			}
 		}
@@ -283,13 +288,13 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 
 		FPRINTF(stderr, "comparing results ...\n");
 #ifdef PRINT_OUTPUT
-		for (j = 0; j < size; j++)
+		for (m = 0; m < size; m++)
 		{
-			for (i = 0; i < size; i++)
+			for (n = 0; n < size; n++)
 			{
-				if (i <= j)
+				if (n <= m)
 				{
-					FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
+					FPRINTF(stdout, "%2.2f\t", test_mat[m +n*size]);
 				}
 				else
 				{
@@ -300,17 +305,17 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 		}
 #endif
 
-		for (j = 0; j < size; j++)
+		for (m = 0; m < size; m++)
 		{
-			for (i = 0; i < size; i++)
+			for (n = 0; n < size; n++)
 			{
-				if (i <= j)
+				if (n <= m)
 				{
-	                                float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-	                                float err = fabsf(test_mat[j +i*size] - orig) / orig;
+	                                float orig = (1.0f/(1.0f+m+n)) + ((m == n)?1.0f*size:0.0f);
+	                                float err = fabsf(test_mat[m +n*size] - orig) / orig;
 	                                if (err > 0.0001)
 					{
-	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.6f != %2.6f (err %2.6f)\n", i, j, test_mat[j +i*size], orig, err);
+	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.6f != %2.6f (err %2.6f)\n", m, n, test_mat[m +n*size], orig, err);
 	                                        assert(0);
 	                                }
 	                        }

+ 127 - 90
examples/cholesky/cholesky_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2008-2017                                Université de Bordeaux
+ * Copyright (C) 2008-2017,2020                           Université de Bordeaux
  * Copyright (C) 2012,2013                                Inria
  * Copyright (C) 2010-2013,2015,2017                      CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
@@ -24,6 +24,9 @@
  * It also uses data partitioning to split the matrix into submatrices
  */
 
+/* Note: this is using fortran ordering, i.e. column-major ordering, i.e.
+ * elements with consecutive row number are consecutive in memory */
+
 #include "cholesky.h"
 #include <starpu_perfmodel.h>
 
@@ -76,17 +79,19 @@ static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned
 	return task;
 }
 
-static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j)
+static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned m)
 {
-	struct starpu_task *task = create_task(TAG21(k, j));
+	int ret;
+
+	struct starpu_task *task = create_task(TAG21(k, m));
 
 	task->cl = &cl21;
 
 	/* which sub-data is manipulated ? */
 	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
-	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
+	task->handles[1] = starpu_data_get_sub_data(dataA, 2, m, k);
 
-	if (!noprio_p && (j == k+1))
+	if (!noprio_p && (m == k+1))
 	{
 		task->priority = STARPU_MAX_PRIO;
 	}
@@ -94,39 +99,37 @@ static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j)
 	/* enforce dependencies ... */
 	if (k > 0)
 	{
-		starpu_tag_declare_deps(TAG21(k, j), 2, TAG11(k), TAG22(k-1, k, j));
+		starpu_tag_declare_deps(TAG21(k, m), 2, TAG11(k), TAG22(k-1, m, k));
 	}
 	else
 	{
-		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
+		starpu_tag_declare_deps(TAG21(k, m), 1, TAG11(k));
 	}
 
-	int n = starpu_matrix_get_nx(task->handles[0]);
-	task->flops = FLOPS_STRSM(n, n);
-
-	int ret = starpu_task_submit(task);
-        if (STARPU_UNLIKELY(ret == -ENODEV))
-	{
-                FPRINTF(stderr, "No worker may execute this task\n");
-                exit(0);
-        }
+	int nx = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_STRSM(nx, nx);
 
+	ret = starpu_task_submit(task);
+	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	return ret;
 }
 
-static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, unsigned j)
+static int create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned m, unsigned n)
 {
-/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
+	int ret;
+
+/*	FPRINTF(stdout, "task 22 k,n,m = %d,%d,%d TAG = %llx\n", k,m,n, TAG22(k,m,n)); */
 
-	struct starpu_task *task = create_task(TAG22(k, i, j));
+	struct starpu_task *task = create_task(TAG22(k, m, n));
 
 	task->cl = &cl22;
 
 	/* which sub-data is manipulated ? */
-	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, i);
-	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
-	task->handles[2] = starpu_data_get_sub_data(dataA, 2, i, j);
+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, n, k);
+	task->handles[1] = starpu_data_get_sub_data(dataA, 2, m, k);
+	task->handles[2] = starpu_data_get_sub_data(dataA, 2, m, n);
 
-	if (!noprio_p && (i == k + 1) && (j == k +1) )
+	if (!noprio_p && (n == k + 1) && (m == k +1) )
 	{
 		task->priority = STARPU_MAX_PRIO;
 	}
@@ -134,26 +137,21 @@ static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, u
 	/* enforce dependencies ... */
 	if (k > 0)
 	{
-		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG21(k, i), TAG21(k, j));
+		starpu_tag_declare_deps(TAG22(k, m, n), 3, TAG22(k-1, m, n), TAG21(k, n), TAG21(k, m));
 	}
 	else
 	{
-		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
+		starpu_tag_declare_deps(TAG22(k, m, n), 2, TAG21(k, n), TAG21(k, m));
 	}
 
-	int n = starpu_matrix_get_nx(task->handles[0]);
-	task->flops = FLOPS_SGEMM(n, n, n);
+	int nx = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_SGEMM(nx, nx, nx);
 
-	int ret = starpu_task_submit(task);
-        if (STARPU_UNLIKELY(ret == -ENODEV))
-	{
-                FPRINTF(stderr, "No worker may execute this task\n");
-                exit(0);
-        }
+	ret = starpu_task_submit(task);
+	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	return ret;
 }
 
-
-
 /*
  *	code to bootstrap the factorization
  *	and construct the DAG
@@ -161,13 +159,15 @@ static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, u
 
 static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 {
+	int ret;
+
 	double start;
 	double end;
 
 	struct starpu_task *entry_task = NULL;
 
 	/* create all the DAG nodes */
-	unsigned i,j,k;
+	unsigned k, m, n;
 
 	start = starpu_timing_now();
 
@@ -191,27 +191,27 @@ static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
 		}
 
-		for (j = k+1; j<nblocks; j++)
+		for (m = k+1; m<nblocks; m++)
 		{
-			create_task_21(dataA, k, j);
+			ret = create_task_21(dataA, k, m);
+			if (ret == -ENODEV) return 77;
 
-			for (i = k+1; i<nblocks; i++)
+			for (n = k+1; n<nblocks; n++)
 			{
-				if (i <= j)
-					create_task_22(dataA, k, i, j);
+				if (n <= m)
+				{
+					ret = create_task_22(dataA, k, m, n);
+					if (ret == -ENODEV) return 77;
+				}
 			}
 		}
 		starpu_iteration_pop();
 	}
 
 	/* schedule the codelet */
-	int ret = starpu_task_submit(entry_task);
-        if (STARPU_UNLIKELY(ret == -ENODEV))
-	{
-                FPRINTF(stderr, "No worker may execute this task\n");
-                exit(0);
-        }
-
+	ret = starpu_task_submit(entry_task);
+	if (ret == -ENODEV) return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 	/* stall the application until the end of computations */
 	starpu_tag_wait(TAG11(nblocks-1));
@@ -220,15 +220,16 @@ static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
 	end = starpu_timing_now();
 
-
 	double timing = end - start;
 
-	unsigned n = starpu_matrix_get_nx(dataA);
+	unsigned nx = starpu_matrix_get_nx(dataA);
 
-	double flop = (1.0f*n*n*n)/3.0f;
+	double flop = (1.0f*nx*nx*nx)/3.0f;
 
 	PRINTF("# size\tms\tGFlops\n");
-	PRINTF("%u\t%.0f\t%.1f\n", n, timing/1000, (flop/timing/1000.0f));
+	PRINTF("%u\t%.0f\t%.1f\n", nx, timing/1000, (flop/timing/1000.0f));
+
+	return 0;
 }
 
 static int initialize_system(int argc, char **argv, float **A, unsigned pinned)
@@ -273,20 +274,23 @@ static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 	starpu_data_handle_t dataA;
 
 	/* monitor and partition the A matrix into blocks :
-	 * one block is now determined by 2 unsigned (i,j) */
+	 * one block is now determined by 2 unsigned (m,n) */
 	starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(float));
 
 	starpu_data_set_sequential_consistency_flag(dataA, 0);
 
+	/* Split into blocks of complete rows first */
 	struct starpu_data_filter f =
 	{
-		.filter_func = starpu_matrix_filter_vertical_block,
+		.filter_func = starpu_matrix_filter_block,
 		.nchildren = nblocks
 	};
 
+	/* Then split rows into tiles */
 	struct starpu_data_filter f2 =
 	{
-		.filter_func = starpu_matrix_filter_block,
+		/* Note: here "vertical" is for row-major, we are here using column-major. */
+		.filter_func = starpu_matrix_filter_vertical_block,
 		.nchildren = nblocks
 	};
 
@@ -321,28 +325,28 @@ int main(int argc, char **argv)
 	if (ret) return ret;
 
 #ifndef STARPU_SIMGRID
-	unsigned i,j;
-	for (i = 0; i < size_p; i++)
+	unsigned m,n;
+
+	for (n = 0; n < size_p; n++)
 	{
-		for (j = 0; j < size_p; j++)
+		for (m = 0; m < size_p; m++)
 		{
-			mat[j +i*size_p] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size_p:0.0f);
-			/* mat[j +i*size_p] = ((i == j)?1.0f*size_p:0.0f); */
+			mat[m +n*size_p] = (1.0f/(1.0f+n+m)) + ((n == m)?1.0f*size_p:0.0f);
+			/* mat[m +n*size_p] = ((n == m)?1.0f*size_p:0.0f); */
 		}
 	}
-#endif
 
-
-#ifdef CHECK_OUTPUT
+/* #define PRINT_OUTPUT */
+#ifdef PRINT_OUTPUT
 	FPRINTF(stdout, "Input :\n");
 
-	for (j = 0; j < size_p; j++)
+	for (m = 0; m < size_p; m++)
 	{
-		for (i = 0; i < size_p; i++)
+		for (n = 0; n < size_p; n++)
 		{
-			if (i <= j)
+			if (n <= m)
 			{
-				FPRINTF(stdout, "%2.2f\t", mat[j +i*size_p]);
+				FPRINTF(stdout, "%2.2f\t", mat[m +n*size_p]);
 			}
 			else
 			{
@@ -352,54 +356,87 @@ int main(int argc, char **argv)
 		FPRINTF(stdout, "\n");
 	}
 #endif
-
+#endif
 
 	cholesky(mat, size_p, size_p, nblocks_p);
 
-#ifdef CHECK_OUTPUT
+#ifndef STARPU_SIMGRID
+#ifdef PRINT_OUTPUT
 	FPRINTF(stdout, "Results :\n");
 
-	for (j = 0; j < size_p; j++)
+	for (m = 0; m < size_p; m++)
 	{
-		for (i = 0; i < size_p; i++)
+		for (n = 0; n < size_p; n++)
 		{
-			if (i <= j)
+			if (n <= m)
 			{
-				FPRINTF(stdout, "%2.2f\t", mat[j +i*size_p]);
+				FPRINTF(stdout, "%2.2f\t", mat[m +n*size_p]);
 			}
 			else
 			{
 				FPRINTF(stdout, ".\t");
-				mat[j+i*size_p] = 0.0f; /* debug */
 			}
 		}
 		FPRINTF(stdout, "\n");
 	}
+#endif
 
-	FPRINTF(stderr, "compute explicit LLt ...\n");
-	float *test_mat = malloc(size_p*size_p*sizeof(float));
-	STARPU_ASSERT(test_mat);
-
-	STARPU_SSYRK("L", "N", size_p, size_p, 1.0f,
-		     mat, size_p, 0.0f, test_mat, size_p);
-
-	FPRINTF(stderr, "comparing results ...\n");
-	for (j = 0; j < size_p; j++)
+	if (check_p)
 	{
-		for (i = 0; i < size_p; i++)
+		FPRINTF(stderr, "compute explicit LLt ...\n");
+		for (m = 0; m < size_p; m++)
 		{
-			if (i <= j)
+			for (n = 0; n < size_p; n++)
 			{
-				FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size_p]);
+				if (n > m)
+				{
+					mat[m+n*size_p] = 0.0f; /* debug */
+				}
 			}
-			else
+		}
+		float *test_mat = malloc(size_p*size_p*sizeof(float));
+		STARPU_ASSERT(test_mat);
+
+		STARPU_SSYRK("L", "N", size_p, size_p, 1.0f,
+			     mat, size_p, 0.0f, test_mat, size_p);
+
+		FPRINTF(stderr, "comparing results ...\n");
+#ifdef PRINT_OUTPUT
+		for (m = 0; m < size_p; m++)
+		{
+			for (n = 0; n < size_p; n++)
 			{
-				FPRINTF(stdout, ".\t");
+				if (n <= m)
+				{
+					FPRINTF(stdout, "%2.2f\t", test_mat[m +n*size_p]);
+				}
+				else
+				{
+					FPRINTF(stdout, ".\t");
+				}
 			}
+			FPRINTF(stdout, "\n");
 		}
-		FPRINTF(stdout, "\n");
+#endif
+
+		for (m = 0; m < size_p; m++)
+		{
+			for (n = 0; n < size_p; n++)
+			{
+				if (n <= m)
+				{
+	                                float orig = (1.0f/(1.0f+m+n)) + ((m == n)?1.0f*size_p:0.0f);
+	                                float err = fabsf(test_mat[m +n*size_p] - orig) / orig;
+	                                if (err > 0.0001)
+					{
+	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.6f != %2.6f (err %2.6f)\n", m, n, test_mat[m +n*size_p], orig, err);
+	                                        assert(0);
+	                                }
+	                        }
+			}
+	        }
+		free(test_mat);
 	}
-	free(test_mat);
 #endif
 
 	shutdown_system(&mat, size_p, pinned_p);

+ 55 - 52
examples/cholesky/cholesky_tile_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2017                                Université de Bordeaux
+ * Copyright (C) 2009-2017,2020                           Université de Bordeaux
  * Copyright (C) 2012,2013                                Inria
  * Copyright (C) 2010-2013,2015-2017                      CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
@@ -23,13 +23,16 @@
  * It also directly registers matrix tiles instead of using partitioning.
  */
 
+/* Note: this is using fortran ordering, i.e. column-major ordering, i.e.
+ * elements with consecutive row number are consecutive in memory */
+
 #include "cholesky.h"
 
 #if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_MAGMA)
 #include "magma.h"
 #endif
 
-/* A [ y ] [ x ] */
+/* A [ m ] [ n ] */
 float *A[NMAXBLOCKS][NMAXBLOCKS];
 starpu_data_handle_t A_state[NMAXBLOCKS][NMAXBLOCKS];
 
@@ -78,19 +81,19 @@ static struct starpu_task * create_task_11(unsigned k, unsigned nblocks)
 	return task;
 }
 
-static int create_task_21(unsigned k, unsigned j)
+static int create_task_21(unsigned k, unsigned m)
 {
 	int ret;
 
-	struct starpu_task *task = create_task(TAG21(k, j));
+	struct starpu_task *task = create_task(TAG21(m, k));
 
 	task->cl = &cl21;
 
 	/* which sub-data is manipulated ? */
 	task->handles[0] = A_state[k][k];
-	task->handles[1] = A_state[j][k];
+	task->handles[1] = A_state[m][k];
 
-	if (j == k+1)
+	if (m == k+1)
 	{
 		task->priority = STARPU_MAX_PRIO;
 	}
@@ -98,11 +101,11 @@ static int create_task_21(unsigned k, unsigned j)
 	/* enforce dependencies ... */
 	if (k > 0)
 	{
-		starpu_tag_declare_deps(TAG21(k, j), 2, TAG11(k), TAG22(k-1, k, j));
+		starpu_tag_declare_deps(TAG21(m, k), 2, TAG11(k), TAG22(k-1, m, k));
 	}
 	else
 	{
-		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
+		starpu_tag_declare_deps(TAG21(m, k), 1, TAG11(k));
 	}
 
 	int n = starpu_matrix_get_nx(task->handles[0]);
@@ -113,22 +116,22 @@ static int create_task_21(unsigned k, unsigned j)
 	return ret;
 }
 
-static int create_task_22(unsigned k, unsigned i, unsigned j)
+static int create_task_22(unsigned k, unsigned m, unsigned n)
 {
 	int ret;
 
-/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
+/*	FPRINTF(stdout, "task 22 k,n,m = %d,%d,%d TAG = %llx\n", k,m,n, TAG22(k,m,n)); */
 
-	struct starpu_task *task = create_task(TAG22(k, i, j));
+	struct starpu_task *task = create_task(TAG22(k, m, n));
 
 	task->cl = &cl22;
 
 	/* which sub-data is manipulated ? */
-	task->handles[0] = A_state[i][k];
-	task->handles[1] = A_state[j][k];
-	task->handles[2] = A_state[j][i];
+	task->handles[0] = A_state[n][k];
+	task->handles[1] = A_state[m][k];
+	task->handles[2] = A_state[m][n];
 
-	if ( (i == k + 1) && (j == k +1) )
+	if (!noprio_p && (n == k + 1) && (m == k +1) )
 	{
 		task->priority = STARPU_MAX_PRIO;
 	}
@@ -136,15 +139,15 @@ static int create_task_22(unsigned k, unsigned i, unsigned j)
 	/* enforce dependencies ... */
 	if (k > 0)
 	{
-		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG21(k, i), TAG21(k, j));
+		starpu_tag_declare_deps(TAG22(k, m, n), 3, TAG22(k-1, m, n), TAG21(n, k), TAG21(m, k));
 	}
 	else
 	{
-		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
+		starpu_tag_declare_deps(TAG22(k, m, n), 2, TAG21(n, k), TAG21(m, k));
 	}
 
-	int n = starpu_matrix_get_nx(task->handles[0]);
-	task->flops = FLOPS_SGEMM(n, n, n);
+	int nx = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_SGEMM(nx, nx, nx);
 
 	ret = starpu_task_submit(task);
 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
@@ -166,7 +169,7 @@ static int cholesky_no_stride(void)
 	struct starpu_task *entry_task = NULL;
 
 	/* create all the DAG nodes */
-	unsigned i,j,k;
+	unsigned k, m, n;
 
 	for (k = 0; k < nblocks_p; k++)
 	{
@@ -183,17 +186,17 @@ static int cholesky_no_stride(void)
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		}
 
-		for (j = k+1; j<nblocks_p; j++)
+		for (m = k+1; m<nblocks_p; m++)
 		{
-			ret = create_task_21(k, j);
+			ret = create_task_21(k, m);
 			if (ret == -ENODEV) return 77;
 
-			for (i = k+1; i<nblocks_p; i++)
+			for (n = k+1; n<nblocks_p; n++)
 			{
-				if (i <= j)
+				if (n <= m)
 				{
-				     ret = create_task_22(k, i, j);
-				     if (ret == -ENODEV) return 77;
+					ret = create_task_22(k, m, n);
+					if (ret == -ENODEV) return 77;
 				}
 			}
 		}
@@ -222,7 +225,7 @@ static int cholesky_no_stride(void)
 
 int main(int argc, char **argv)
 {
-	unsigned x, y;
+	unsigned n, m;
 	int ret;
 
 #ifdef STARPU_HAVE_MAGMA
@@ -256,13 +259,13 @@ int main(int argc, char **argv)
 
 	starpu_cublas_init();
 
-	for (y = 0; y < nblocks_p; y++)
-	for (x = 0; x < nblocks_p; x++)
+	for (m = 0; m < nblocks_p; m++)
+	for (n = 0; n < nblocks_p; n++)
 	{
-		if (x <= y)
+		if (n <= m)
 		{
-			starpu_malloc_flags((void **)&A[y][x], BLOCKSIZE*BLOCKSIZE*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
-			assert(A[y][x]);
+			starpu_malloc_flags((void **)&A[m][n], BLOCKSIZE*BLOCKSIZE*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+			assert(A[m][n]);
 		}
 	}
 
@@ -271,44 +274,44 @@ int main(int argc, char **argv)
 	 *
 	 *	Hilbert matrix : h(i,j) = 1/(i+j+1) ( + n In to make is stable )
 	 * */
-	for (y = 0; y < nblocks_p; y++)
-	for (x = 0; x < nblocks_p; x++)
-	if (x <= y)
+	for (m = 0; m < nblocks_p; m++)
+	for (n = 0; n < nblocks_p; n++)
+	if (n <= m)
 	{
-		unsigned i, j;
-		for (i = 0; i < BLOCKSIZE; i++)
-		for (j = 0; j < BLOCKSIZE; j++)
+		unsigned mm, nn;
+		for (mm = 0; mm < BLOCKSIZE; mm++)
+		for (nn = 0; nn < BLOCKSIZE; nn++)
 		{
-			A[y][x][i*BLOCKSIZE + j] =
-				(float)(1.0f/((float) (1.0+(x*BLOCKSIZE+i)+(y*BLOCKSIZE+j))));
+			A[m][n][mm*BLOCKSIZE + nn] =
+				(float)(1.0f/((float) (1.0+(n*BLOCKSIZE+mm)+(m*BLOCKSIZE+nn))));
 
 			/* make it a little more numerically stable ... ;) */
-			if ((x == y) && (i == j))
-				A[y][x][i*BLOCKSIZE + j] += (float)(2*size_p);
+			if ((n == m) && (mm == nn))
+				A[m][n][mm*BLOCKSIZE + nn] += (float)(2*size_p);
 		}
 	}
 #endif
 
-	for (y = 0; y < nblocks_p; y++)
-	for (x = 0; x < nblocks_p; x++)
+	for (m = 0; m < nblocks_p; m++)
+	for (n = 0; n < nblocks_p; n++)
 	{
-		if (x <= y)
+		if (n <= m)
 		{
-			starpu_matrix_data_register(&A_state[y][x], STARPU_MAIN_RAM, (uintptr_t)A[y][x],
+			starpu_matrix_data_register(&A_state[m][n], STARPU_MAIN_RAM, (uintptr_t)A[m][n],
 						    BLOCKSIZE, BLOCKSIZE, BLOCKSIZE, sizeof(float));
-			starpu_data_set_coordinates(A_state[y][x], 2, x, y);
+			starpu_data_set_coordinates(A_state[m][n], 2, n, m);
 		}
 	}
 
 	ret = cholesky_no_stride();
 
-	for (y = 0; y < nblocks_p; y++)
-	for (x = 0; x < nblocks_p; x++)
+	for (m = 0; m < nblocks_p; m++)
+	for (n = 0; n < nblocks_p; n++)
 	{
-		if (x <= y)
+		if (n <= m)
 		{
-			starpu_data_unregister(A_state[y][x]);
-			starpu_free_flags(A[y][x], BLOCKSIZE*BLOCKSIZE*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+			starpu_data_unregister(A_state[m][n]);
+			starpu_free_flags(A[m][n], BLOCKSIZE*BLOCKSIZE*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
 		}
 	}
 

+ 5 - 1
include/starpu_data_filters.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012,2014,2015,2017,2019            Université de Bordeaux
+ * Copyright (C) 2009-2012,2014,2015,2017,2019-2020       Université de Bordeaux
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010-2013,2015,2017,2018,2019            CNRS
  * Copyright (C) 2011                                     Inria
@@ -344,6 +344,8 @@ void starpu_csr_filter_vertical_block(void *father_interface, void *child_interf
    Predefined partitioning functions for matrix
    data. Examples on how to use them are shown in \ref
    PartitioningData.
+   Note: this is using the C element order which is row-major, i.e. elements
+   with consecutive x coordinates are consecutive in memory.
    @{
 */
 
@@ -450,6 +452,8 @@ void starpu_vector_filter_divide_in_2(void *father_interface, void *child_interf
    Predefined partitioning functions for block data. Examples on how
    to use them are shown in \ref PartitioningData. An example is
    available in \c examples/filters/shadow3d.c
+   Note: this is using the C element order which is row-major, i.e. elements
+   with consecutive x coordinates are consecutive in memory.
    @{
 */
 

+ 53 - 53
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -79,24 +79,24 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 	double start;
 	double end;
 	starpu_data_handle_t **data_handles;
-	unsigned x,y,i,j,k;
+	unsigned k, m, n;
 
 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
 
 	/* create all the DAG nodes */
 
 	data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
-	for(x=0 ; x<nblocks ; x++) data_handles[x] = malloc(nblocks*sizeof(starpu_data_handle_t));
+	for(m=0 ; m<nblocks ; m++) data_handles[m] = malloc(nblocks*sizeof(starpu_data_handle_t));
 
-	for(x = 0; x < nblocks ; x++)
+	for (m = 0; m < nblocks; m++)
 	{
-		for (y = 0; y < nblocks; y++)
+		for(n = 0; n < nblocks ; n++)
 		{
-			int mpi_rank = my_distrib(x, y, nodes);
+			int mpi_rank = my_distrib(m, n, nodes);
 			if (mpi_rank == rank)
 			{
-				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
-				starpu_matrix_data_register(&data_handles[x][y], STARPU_MAIN_RAM, (uintptr_t)matA[x][y],
+				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, n, m);
+				starpu_matrix_data_register(&data_handles[m][n], STARPU_MAIN_RAM, (uintptr_t)matA[m][n],
 						ld, size/nblocks, size/nblocks, sizeof(float));
 			}
 #ifdef STARPU_DEVEL
@@ -105,14 +105,14 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 			else
 			{
 				/* I don't own this index, but will need it for my computations */
-				//fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
-				starpu_matrix_data_register(&data_handles[x][y], -1, (uintptr_t)NULL,
+				//fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, n, m);
+				starpu_matrix_data_register(&data_handles[m][n], -1, (uintptr_t)NULL,
 						ld, size/nblocks, size/nblocks, sizeof(float));
 			}
-			if (data_handles[x][y])
+			if (data_handles[m][n])
 			{
-				starpu_data_set_coordinates(data_handles[x][y], 2, x, y);
-				starpu_mpi_data_register(data_handles[x][y], (y*nblocks)+x, mpi_rank);
+				starpu_data_set_coordinates(data_handles[m][n], 2, n, m);
+				starpu_mpi_data_register(data_handles[m][n], (m*nblocks)+n, mpi_rank);
 			}
 		}
 	}
@@ -129,34 +129,34 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 				       STARPU_RW, data_handles[k][k],
 				       0);
 
-		for (j = k+1; j<nblocks; j++)
+		for (m = k+1; m<nblocks; m++)
 		{
 			starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
-					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - j) : (j == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 					       STARPU_R, data_handles[k][k],
-					       STARPU_RW, data_handles[k][j],
+					       STARPU_RW, data_handles[m][k],
 					       0);
 
 			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
 			if (my_distrib(k, k, nodes) == rank)
 				starpu_data_wont_use(data_handles[k][k]);
 
-			for (i = k+1; i<nblocks; i++)
+			for (n = k+1; n<nblocks; n++)
 			{
-				if (i <= j)
+				if (n <= m)
 				{
 					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
-							       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - j - i) : ((i == k+1) && (j == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
-							       STARPU_R, data_handles[k][i],
-							       STARPU_R, data_handles[k][j],
-							       STARPU_RW | STARPU_COMMUTE, data_handles[i][j],
+							       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+							       STARPU_R, data_handles[n][k],
+							       STARPU_R, data_handles[m][k],
+							       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
 							       0);
 				}
 			}
 
-			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][j]);
-			if (my_distrib(k, j, nodes) == rank)
-				starpu_data_wont_use(data_handles[k][j]);
+			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
+			if (my_distrib(m, k, nodes) == rank)
+				starpu_data_wont_use(data_handles[m][k]);
 		}
 		starpu_iteration_pop();
 	}
@@ -166,18 +166,18 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 	end = starpu_timing_now();
 
-	for(x = 0; x < nblocks ; x++)
+	for (m = 0; m < nblocks; m++)
 	{
-		for (y = 0; y < nblocks; y++)
+		for(n = 0; n < nblocks ; n++)
 		{
 			/* Get back data on node 0 for the check */
 			if (check)
-				starpu_mpi_get_data_on_node(MPI_COMM_WORLD, data_handles[x][y], 0);
+				starpu_mpi_get_data_on_node(MPI_COMM_WORLD, data_handles[m][n], 0);
 
-			if (data_handles[x][y])
-				starpu_data_unregister(data_handles[x][y]);
+			if (data_handles[m][n])
+				starpu_data_unregister(data_handles[m][n]);
 		}
-		free(data_handles[x]);
+		free(data_handles[m]);
 	}
 	free(data_handles);
 
@@ -190,31 +190,31 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 
 void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *correctness, double *flops, double epsilon)
 {
-	unsigned i,j,x,y;
+	unsigned nn,mm,n,m;
 	float *rmat = malloc(size*size*sizeof(float));
 
-	for(x=0 ; x<nblocks ; x++)
+	for(n=0 ; n<nblocks ; n++)
 	{
-		for(y=0 ; y<nblocks ; y++)
+		for(m=0 ; m<nblocks ; m++)
 		{
-			for (i = 0; i < BLOCKSIZE; i++)
+			for (nn = 0; nn < BLOCKSIZE; nn++)
 			{
-				for (j = 0; j < BLOCKSIZE; j++)
+				for (mm = 0; mm < BLOCKSIZE; mm++)
 				{
-					rmat[j+(y*BLOCKSIZE)+(i+(x*BLOCKSIZE))*size] = matA[x][y][j +i*BLOCKSIZE];
+					rmat[mm+(m*BLOCKSIZE)+(nn+(n*BLOCKSIZE))*size] = matA[m][n][mm +nn*BLOCKSIZE];
 				}
 			}
 		}
 	}
 
 	FPRINTF(stderr, "[%d] compute explicit LLt ...\n", rank);
-	for (j = 0; j < size; j++)
+	for (mm = 0; mm < size; mm++)
 	{
-		for (i = 0; i < size; i++)
+		for (nn = 0; nn < size; nn++)
 		{
-			if (i > j)
+			if (nn > mm)
 			{
-				rmat[j+i*size] = 0.0f; // debug
+				rmat[mm+nn*size] = 0.0f; // debug
 			}
 		}
 	}
@@ -227,13 +227,13 @@ void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *corr
 	FPRINTF(stderr, "[%d] comparing results ...\n", rank);
 	if (display)
 	{
-		for (j = 0; j < size; j++)
+		for (mm = 0; mm < size; mm++)
 		{
-			for (i = 0; i < size; i++)
+			for (nn = 0; nn < size; nn++)
 			{
-				if (i <= j)
+				if (nn <= mm)
 				{
-					printf("%2.2f\t", test_mat[j +i*size]);
+					printf("%2.2f\t", test_mat[mm +nn*size]);
 				}
 				else
 				{
@@ -245,24 +245,24 @@ void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *corr
 	}
 
 	*correctness = 1;
-	for(x = 0; x < nblocks ; x++)
+	for(n = 0; n < nblocks ; n++)
 	{
-		for (y = 0; y < nblocks; y++)
+		for (m = 0; m < nblocks; m++)
 		{
-			int mpi_rank = my_distrib(x, y, nodes);
+			int mpi_rank = my_distrib(m, n, nodes);
 			if (mpi_rank == rank)
 			{
-				for (i = (size/nblocks)*x ; i < (size/nblocks)*x+(size/nblocks); i++)
+				for (nn = (size/nblocks)*n ; nn < (size/nblocks)*n+(size/nblocks); nn++)
 				{
-					for (j = (size/nblocks)*y ; j < (size/nblocks)*y+(size/nblocks); j++)
+					for (mm = (size/nblocks)*m ; mm < (size/nblocks)*m+(size/nblocks); mm++)
 					{
-						if (i <= j)
+						if (nn <= mm)
 						{
-							float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-							float err = fabsf(test_mat[j +i*size] - orig) / orig;
+							float orig = (1.0f/(1.0f+nn+mm)) + ((nn == mm)?1.0f*size:0.0f);
+							float err = fabsf(test_mat[mm +nn*size] - orig) / orig;
 							if (err > epsilon)
 							{
-								FPRINTF(stderr, "[%d] Error[%u, %u] --> %2.20f != %2.20f (err %2.20f)\n", rank, i, j, test_mat[j +i*size], orig, err);
+								FPRINTF(stderr, "[%d] Error[%u, %u] --> %2.20f != %2.20f (err %2.20f)\n", rank, nn, mm, test_mat[mm +nn*size], orig, err);
 								*correctness = 0;
 								*flops = 0;
 								break;

+ 21 - 18
mpi/examples/matrix_decomposition/mpi_decomposition_matrix.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2013,2015-2017                      CNRS
- * Copyright (C) 2009-2012,2014,2015                      Université de Bordeaux
+ * Copyright (C) 2009-2012,2014,2015,2020                 Université de Bordeaux
  * Copyright (C) 2010                                     Mehdi Juhoor
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -19,7 +19,7 @@
 #include "mpi_cholesky.h"
 
 /* Returns the MPI node number where data indexes index is */
-int my_distrib(int x, int y, int nb_nodes)
+int my_distrib(int y, int x, int nb_nodes)
 {
 	(void)nb_nodes;
 	//return (x+y) % nb_nodes;
@@ -62,27 +62,30 @@ void matrix_display(float ***bmat, int rank)
 	}
 }
 
+/* Note: bmat is indexed by bmat[m][n][mm+nn*BLOCKSIZE],
+ * i.e. the content of the tiles is column-major, but the array of tiles is
+ * row-major to keep the m,n notation everywhere */
 void matrix_init(float ****bmat, int rank, int nodes, int alloc_everywhere)
 {
-	unsigned i,j,x,y;
+	unsigned nn,mm,m,n;
 
 	*bmat = malloc(nblocks * sizeof(float **));
-	for(x=0 ; x<nblocks ; x++)
+	for(m=0 ; m<nblocks ; m++)
 	{
-		(*bmat)[x] = malloc(nblocks * sizeof(float *));
-		for(y=0 ; y<nblocks ; y++)
+		(*bmat)[m] = malloc(nblocks * sizeof(float *));
+		for(n=0 ; n<nblocks ; n++)
 		{
-			int mpi_rank = my_distrib(x, y, nodes);
+			int mpi_rank = my_distrib(m, n, nodes);
 			if (alloc_everywhere || (mpi_rank == rank))
 			{
-				starpu_malloc((void **)&(*bmat)[x][y], BLOCKSIZE*BLOCKSIZE*sizeof(float));
-				for (i = 0; i < BLOCKSIZE; i++)
+				starpu_malloc((void **)&(*bmat)[m][n], BLOCKSIZE*BLOCKSIZE*sizeof(float));
+				for (nn = 0; nn < BLOCKSIZE; nn++)
 				{
-					for (j = 0; j < BLOCKSIZE; j++)
+					for (mm = 0; mm < BLOCKSIZE; mm++)
 					{
 #ifndef STARPU_SIMGRID
-						(*bmat)[x][y][j +i*BLOCKSIZE] = (1.0f/(1.0f+(i+(x*BLOCKSIZE)+j+(y*BLOCKSIZE)))) + ((i+(x*BLOCKSIZE) == j+(y*BLOCKSIZE))?1.0f*size:0.0f);
-						//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
+						(*bmat)[m][n][mm +nn*BLOCKSIZE] = (1.0f/(1.0f+(nn+(m*BLOCKSIZE)+mm+(n*BLOCKSIZE)))) + ((nn+(m*BLOCKSIZE) == mm+(n*BLOCKSIZE))?1.0f*size:0.0f);
+						//mat[mm +nn*size] = ((nn == mm)?1.0f*size:0.0f);
 #endif
 					}
 				}
@@ -93,19 +96,19 @@ void matrix_init(float ****bmat, int rank, int nodes, int alloc_everywhere)
 
 void matrix_free(float ****bmat, int rank, int nodes, int alloc_everywhere)
 {
-	unsigned x, y;
+	unsigned m, n;
 
-	for(x=0 ; x<nblocks ; x++)
+	for(m=0 ; m<nblocks ; m++)
 	{
-		for(y=0 ; y<nblocks ; y++)
+		for(n=0 ; n<nblocks ; n++)
 		{
-			int mpi_rank = my_distrib(x, y, nodes);
+			int mpi_rank = my_distrib(m, n, nodes);
 			if (alloc_everywhere || (mpi_rank == rank))
 			{
-				starpu_free((void *)(*bmat)[x][y]);
+				starpu_free((void *)(*bmat)[m][n]);
 			}
 		}
-		free((*bmat)[x]);
+		free((*bmat)[m]);
 	}
 	free(*bmat);
 }

+ 2 - 2
mpi/examples/matrix_decomposition/mpi_decomposition_matrix.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2013,2015,2017                      CNRS
- * Copyright (C) 2009-2012,2014                           Université de Bordeaux
+ * Copyright (C) 2009-2012,2014,2020                      Université de Bordeaux
  * Copyright (C) 2010                                     Mehdi Juhoor
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -20,7 +20,7 @@
 #define __MPI_CHOLESKY_MATRIX_H__
 
 /* Returns the MPI node number where data indexes index is */
-int my_distrib(int x, int y, int nb_nodes);
+int my_distrib(int y, int x, int nb_nodes);
 
 void matrix_display(float ***bmat, int rank);
 void matrix_init(float ****bmat, int rank, int nodes, int alloc_everywhere);