Parcourir la source

Use a coherent m,n ordering in all cholesky codes.

Avoid i,j which is ambiguous.
Samuel Thibault il y a 5 ans
Parent
commit
49da257213

+ 45 - 40
examples/cholesky/cholesky_compil.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2011-2013,2015                           Inria
  * Copyright (C) 2011-2013,2015                           Inria
- * Copyright (C) 2009-2017,2019                           Université de Bordeaux
+ * Copyright (C) 2009-2017,2019-2020                           Université de Bordeaux
  * Copyright (C) 2010-2013,2015-2017                      CNRS
  * Copyright (C) 2010-2013,2015-2017                      CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010                                     Mehdi Juhoor
@@ -24,6 +24,9 @@
  * compiler-side optimizations.
  * compiler-side optimizations.
  */
  */
 
 
+/* Note: this is using fortran ordering, i.e. column-major ordering, i.e.
+ * elements with consecutive row number are consecutive in memory */
+
 #include "cholesky.h"
 #include "cholesky.h"
 #include "../sched_ctx_utils/sched_ctx_utils.h"
 #include "../sched_ctx_utils/sched_ctx_utils.h"
 #include <math.h>
 #include <math.h>
@@ -50,8 +53,8 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
 
 	unsigned long nelems = starpu_matrix_get_nx(dataA);
 	unsigned long nelems = starpu_matrix_get_nx(dataA);
 	unsigned long nn = nelems/nblocks;
 	unsigned long nn = nelems/nblocks;
-	int N = nblocks;
 	int M = nblocks;
 	int M = nblocks;
+	int N = nblocks;
 
 
 	int lambda_b = starpu_get_env_float_default("CHOLESKY_LAMBDA_B", nblocks);
 	int lambda_b = starpu_get_env_float_default("CHOLESKY_LAMBDA_B", nblocks);
 	int lambda_o_u = starpu_get_env_float_default("CHOLESKY_LAMBDA_O_U", 0);
 	int lambda_o_u = starpu_get_env_float_default("CHOLESKY_LAMBDA_O_U", 0);
@@ -70,7 +73,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 #define ceild(n,d)  ceil(((double)(n))/((double)(d)))
 #define ceild(n,d)  ceil(((double)(n))/((double)(d)))
 #define floord(n,d) floor(((double)(n))/((double)(d)))
 #define floord(n,d) floor(((double)(n))/((double)(d)))
 
 
-#define A(i,j) starpu_data_get_sub_data(dataA, 2, j, i)
+#define A(i,j) starpu_data_get_sub_data(dataA, 2, i, j)
 
 
 #define _POTRF(cl, A, prio, name) do { \
 #define _POTRF(cl, A, prio, name) do { \
 		int ret = starpu_task_insert(cl, \
 		int ret = starpu_task_insert(cl, \
@@ -204,31 +207,34 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 static int cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 static int cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 {
 {
 	starpu_data_handle_t dataA;
 	starpu_data_handle_t dataA;
-	unsigned x, y;
+	unsigned m, n;
 
 
 	/* monitor and partition the A matrix into blocks :
 	/* monitor and partition the A matrix into blocks :
-	 * one block is now determined by 2 unsigned (i,j) */
+	 * one block is now determined by 2 unsigned (m,n) */
 	starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(float));
 	starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(float));
 
 
+	/* Split into blocks of complete rows first */
 	struct starpu_data_filter f =
 	struct starpu_data_filter f =
 	{
 	{
-		.filter_func = starpu_matrix_filter_vertical_block,
+		.filter_func = starpu_matrix_filter_block,
 		.nchildren = nblocks
 		.nchildren = nblocks
 	};
 	};
 
 
+	/* Then split rows into tiles */
 	struct starpu_data_filter f2 =
 	struct starpu_data_filter f2 =
 	{
 	{
-		.filter_func = starpu_matrix_filter_block,
+		/* Note: here "vertical" is for row-major, we are here using column-major. */
+		.filter_func = starpu_matrix_filter_vertical_block,
 		.nchildren = nblocks
 		.nchildren = nblocks
 	};
 	};
 
 
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 
 
-	for (x = 0; x < nblocks; x++)
-		for (y = 0; y < nblocks; y++)
+	for (m = 0; m < nblocks; m++)
+		for (n = 0; n < nblocks; n++)
 		{
 		{
-			starpu_data_handle_t data = starpu_data_get_sub_data(dataA, 2, x, y);
-			starpu_data_set_coordinates(data, 2, x, y);
+			starpu_data_handle_t data = starpu_data_get_sub_data(dataA, 2, m, n);
+			starpu_data_set_coordinates(data, 2, m, n);
 		}
 		}
 
 
 	int ret = _cholesky(dataA, nblocks);
 	int ret = _cholesky(dataA, nblocks);
@@ -244,14 +250,14 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 	float *mat = NULL;
 	float *mat = NULL;
 
 
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
-	unsigned i,j;
+	unsigned m,n;
 	starpu_malloc_flags((void **)&mat, (size_t)size*size*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
 	starpu_malloc_flags((void **)&mat, (size_t)size*size*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
-	for (i = 0; i < size; i++)
+	for (n = 0; n < size; n++)
 	{
 	{
-		for (j = 0; j < size; j++)
+		for (m = 0; m < size; m++)
 		{
 		{
-			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
+			mat[m +n*size] = (1.0f/(1.0f+m+n)) + ((m == n)?1.0f*size:0.0f);
+			/* mat[m +n*size] = ((m == n)?1.0f*size:0.0f); */
 		}
 		}
 	}
 	}
 
 
@@ -259,13 +265,13 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 #ifdef PRINT_OUTPUT
 #ifdef PRINT_OUTPUT
 	FPRINTF(stdout, "Input :\n");
 	FPRINTF(stdout, "Input :\n");
 
 
-	for (j = 0; j < size; j++)
+	for (m = 0; m < size; m++)
 	{
 	{
-		for (i = 0; i < size; i++)
+		for (n = 0; n < size; n++)
 		{
 		{
-			if (i <= j)
+			if (n <= m)
 			{
 			{
-				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+				FPRINTF(stdout, "%2.2f\t", mat[m +n*size]);
 			}
 			}
 			else
 			else
 			{
 			{
@@ -282,18 +288,17 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
 #ifdef PRINT_OUTPUT
 #ifdef PRINT_OUTPUT
 	FPRINTF(stdout, "Results :\n");
 	FPRINTF(stdout, "Results :\n");
-	for (j = 0; j < size; j++)
+	for (m = 0; m < size; m++)
 	{
 	{
-		for (i = 0; i < size; i++)
+		for (n = 0; n < size; n++)
 		{
 		{
-			if (i <= j)
+			if (n <= m)
 			{
 			{
-				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+				FPRINTF(stdout, "%2.2f\t", mat[m +n*size]);
 			}
 			}
 			else
 			else
 			{
 			{
 				FPRINTF(stdout, ".\t");
 				FPRINTF(stdout, ".\t");
-				mat[j+i*size] = 0.0f; /* debug */
 			}
 			}
 		}
 		}
 		FPRINTF(stdout, "\n");
 		FPRINTF(stdout, "\n");
@@ -303,13 +308,13 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 	if (check_p)
 	if (check_p)
 	{
 	{
 		FPRINTF(stderr, "compute explicit LLt ...\n");
 		FPRINTF(stderr, "compute explicit LLt ...\n");
-		for (j = 0; j < size; j++)
+		for (m = 0; m < size; m++)
 		{
 		{
-			for (i = 0; i < size; i++)
+			for (n = 0; n < size; n++)
 			{
 			{
-				if (i > j)
+				if (n > m)
 				{
 				{
-					mat[j+i*size] = 0.0f; /* debug */
+					mat[m+n*size] = 0.0f; /* debug */
 				}
 				}
 			}
 			}
 		}
 		}
@@ -321,13 +326,13 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 
 
 		FPRINTF(stderr, "comparing results ...\n");
 		FPRINTF(stderr, "comparing results ...\n");
 #ifdef PRINT_OUTPUT
 #ifdef PRINT_OUTPUT
-		for (j = 0; j < size; j++)
+		for (m = 0; m < size; m++)
 		{
 		{
-			for (i = 0; i < size; i++)
+			for (n = 0; n < size; n++)
 			{
 			{
-				if (i <= j)
+				if (n <= m)
 				{
 				{
-					FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
+					FPRINTF(stdout, "%2.2f\t", test_mat[m +n*size]);
 				}
 				}
 				else
 				else
 				{
 				{
@@ -338,17 +343,17 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 		}
 		}
 #endif
 #endif
 
 
-		for (j = 0; j < size; j++)
+		for (m = 0; m < size; m++)
 		{
 		{
-			for (i = 0; i < size; i++)
+			for (n = 0; n < size; n++)
 			{
 			{
-				if (i <= j)
+				if (n <= m)
 				{
 				{
-	                                float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-	                                float err = fabsf(test_mat[j +i*size] - orig) / orig;
-	                                if (err > 0.00001)
+	                                float orig = (1.0f/(1.0f+m+n)) + ((m == n)?1.0f*size:0.0f);
+	                                float err = fabsf(test_mat[m +n*size] - orig) / orig;
+	                                if (err > 0.0001)
 					{
 					{
-	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.6f != %2.6f (err %2.6f)\n", i, j, test_mat[j +i*size], orig, err);
+	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.6f != %2.6f (err %2.6f)\n", m, n, test_mat[m +n*size], orig, err);
 	                                        assert(0);
 	                                        assert(0);
 	                                }
 	                                }
 	                        }
 	                        }

+ 115 - 73
examples/cholesky/cholesky_grain_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2008-2017                                Université de Bordeaux
+ * Copyright (C) 2008-2017,2020                           Université de Bordeaux
  * Copyright (C) 2012,2013                                Inria
  * Copyright (C) 2012,2013                                Inria
  * Copyright (C) 2010-2013,2015,2017                      CNRS
  * Copyright (C) 2010-2013,2015,2017                      CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2013                                     Thibaut Lambert
@@ -28,6 +28,9 @@
  * remainder of the matrix with a smaller granularity.
  * remainder of the matrix with a smaller granularity.
  */
  */
 
 
+/* Note: this is using fortran ordering, i.e. column-major ordering, i.e.
+ * elements with consecutive row number are consecutive in memory */
+
 #include "cholesky.h"
 #include "cholesky.h"
 
 
 #if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_MAGMA)
 #if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_MAGMA)
@@ -64,7 +67,8 @@ static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned
 	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
 	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
 
 
 	/* this is an important task */
 	/* this is an important task */
-	task->priority = STARPU_MAX_PRIO;
+	if (!noprio_p)
+		task->priority = STARPU_MAX_PRIO;
 
 
 	/* enforce dependencies ... */
 	/* enforce dependencies ... */
 	if (k > 0)
 	if (k > 0)
@@ -78,19 +82,19 @@ static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned
 	return task;
 	return task;
 }
 }
 
 
-static int create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j, unsigned reclevel)
+static int create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned m, unsigned reclevel)
 {
 {
 	int ret;
 	int ret;
 
 
-	struct starpu_task *task = create_task(TAG21_AUX(k, j, reclevel));
+	struct starpu_task *task = create_task(TAG21_AUX(k, m, reclevel));
 
 
 	task->cl = &cl21;
 	task->cl = &cl21;
 
 
 	/* which sub-data is manipulated ? */
 	/* which sub-data is manipulated ? */
 	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
 	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
-	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
+	task->handles[1] = starpu_data_get_sub_data(dataA, 2, m, k);
 
 
-	if (j == k+1)
+	if (!noprio_p && (m == k+1))
 	{
 	{
 		task->priority = STARPU_MAX_PRIO;
 		task->priority = STARPU_MAX_PRIO;
 	}
 	}
@@ -98,37 +102,37 @@ static int create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j, un
 	/* enforce dependencies ... */
 	/* enforce dependencies ... */
 	if (k > 0)
 	if (k > 0)
 	{
 	{
-		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 2, TAG11_AUX(k, reclevel), TAG22_AUX(k-1, k, j, reclevel));
+		starpu_tag_declare_deps(TAG21_AUX(k, m, reclevel), 2, TAG11_AUX(k, reclevel), TAG22_AUX(k-1, m, k, reclevel));
 	}
 	}
 	else
 	else
 	{
 	{
-		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 1, TAG11_AUX(k, reclevel));
+		starpu_tag_declare_deps(TAG21_AUX(k, m, reclevel), 1, TAG11_AUX(k, reclevel));
 	}
 	}
 
 
-	int n = starpu_matrix_get_nx(task->handles[0]);
-	task->flops = FLOPS_STRSM(n, n);
+	int nx = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_STRSM(nx, nx);
 
 
 	ret = starpu_task_submit(task);
 	ret = starpu_task_submit(task);
 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	return ret;
 	return ret;
 }
 }
 
 
-static int create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, unsigned j, unsigned reclevel)
+static int create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned m, unsigned n, unsigned reclevel)
 {
 {
 	int ret;
 	int ret;
 
 
-/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22_AUX(k,i,j)); */
+/*	FPRINTF(stdout, "task 22 k,n,m = %d,%d,%d TAG = %llx\nx", k,m,n, TAG22_AUX(k,m,n)); */
 
 
-	struct starpu_task *task = create_task(TAG22_AUX(k, i, j, reclevel));
+	struct starpu_task *task = create_task(TAG22_AUX(k, m, n, reclevel));
 
 
 	task->cl = &cl22;
 	task->cl = &cl22;
 
 
 	/* which sub-data is manipulated ? */
 	/* which sub-data is manipulated ? */
-	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, i);
-	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
-	task->handles[2] = starpu_data_get_sub_data(dataA, 2, i, j);
+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, n, k);
+	task->handles[1] = starpu_data_get_sub_data(dataA, 2, m, k);
+	task->handles[2] = starpu_data_get_sub_data(dataA, 2, m, n);
 
 
-	if ( (i == k + 1) && (j == k +1) )
+	if ( (n == k + 1) && (m == k +1) )
 	{
 	{
 		task->priority = STARPU_MAX_PRIO;
 		task->priority = STARPU_MAX_PRIO;
 	}
 	}
@@ -136,15 +140,15 @@ static int create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, un
 	/* enforce dependencies ... */
 	/* enforce dependencies ... */
 	if (k > 0)
 	if (k > 0)
 	{
 	{
-		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 3, TAG22_AUX(k-1, i, j, reclevel), TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
+		starpu_tag_declare_deps(TAG22_AUX(k, m, n, reclevel), 3, TAG22_AUX(k-1, m, n, reclevel), TAG21_AUX(k, n, reclevel), TAG21_AUX(k, m, reclevel));
 	}
 	}
 	else
 	else
 	{
 	{
-		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 2, TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
+		starpu_tag_declare_deps(TAG22_AUX(k, m, n, reclevel), 2, TAG21_AUX(k, n, reclevel), TAG21_AUX(k, m, reclevel));
 	}
 	}
 
 
-	int n = starpu_matrix_get_nx(task->handles[0]);
-	task->flops = FLOPS_SGEMM(n, n, n);
+	int nx = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_SGEMM(nx, nx, nx);
 
 
 	ret = starpu_task_submit(task);
 	ret = starpu_task_submit(task);
 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
@@ -166,7 +170,7 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 	struct starpu_task *entry_task = NULL;
 	struct starpu_task *entry_task = NULL;
 
 
 	/* create all the DAG nodes */
 	/* create all the DAG nodes */
-	unsigned i,j,k;
+	unsigned k, m, n;
 
 
 	starpu_data_handle_t dataA;
 	starpu_data_handle_t dataA;
 
 
@@ -176,15 +180,18 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 
 
 	starpu_data_set_sequential_consistency_flag(dataA, 0);
 	starpu_data_set_sequential_consistency_flag(dataA, 0);
 
 
+	/* Split into blocks of complete rows first */
 	struct starpu_data_filter f =
 	struct starpu_data_filter f =
 	{
 	{
-		.filter_func = starpu_matrix_filter_vertical_block,
+		.filter_func = starpu_matrix_filter_block,
 		.nchildren = nblocks
 		.nchildren = nblocks
 	};
 	};
 
 
+	/* Then split rows into tiles */
 	struct starpu_data_filter f2 =
 	struct starpu_data_filter f2 =
 	{
 	{
-		.filter_func = starpu_matrix_filter_block,
+		/* Note: here "vertical" is for row-major, we are here using column-major. */
+		.filter_func = starpu_matrix_filter_vertical_block,
 		.nchildren = nblocks
 		.nchildren = nblocks
 	};
 	};
 
 
@@ -206,16 +213,16 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		}
 		}
 
 
-		for (j = k+1; j<nblocks; j++)
+		for (m = k+1; m<nblocks; m++)
 		{
 		{
-		     	ret = create_task_21(dataA, k, j, reclevel);
+		     	ret = create_task_21(dataA, k, m, reclevel);
 			if (ret == -ENODEV) return 77;
 			if (ret == -ENODEV) return 77;
 
 
-			for (i = k+1; i<nblocks; i++)
+			for (n = k+1; n<nblocks; n++)
 			{
 			{
-				if (i <= j)
+				if (n <= m)
 				{
 				{
-				     ret = create_task_22(dataA, k, i, j, reclevel);
+				     ret = create_task_22(dataA, k, m, n, reclevel);
 				     if (ret == -ENODEV) return 77;
 				     if (ret == -ENODEV) return 77;
 				}
 				}
 			}
 			}
@@ -248,11 +255,11 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 		STARPU_ASSERT(tag_array);
 		STARPU_ASSERT(tag_array);
 
 
 		unsigned ind = 0;
 		unsigned ind = 0;
-		for (i = nbigblocks; i < nblocks; i++)
-		for (j = nbigblocks; j < nblocks; j++)
+		for (n = nbigblocks; n < nblocks; n++)
+		for (m = nbigblocks; m < nblocks; m++)
 		{
 		{
-			if (i <= j)
-				tag_array[ind++] = TAG22_AUX(nbigblocks - 1, i, j, reclevel);
+			if (n <= m)
+				tag_array[ind++] = TAG22_AUX(nbigblocks - 1, m, n, reclevel);
 		}
 		}
 
 
 		starpu_tag_wait_array(ind, tag_array);
 		starpu_tag_wait_array(ind, tag_array);
@@ -268,7 +275,7 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 	}
 	}
 }
 }
 
 
-static void initialize_system(int argc, char **argv, float **A, unsigned pinned)
+static int initialize_system(int argc, char **argv, float **A, unsigned pinned)
 {
 {
 	int ret;
 	int ret;
 	int flags = STARPU_MALLOC_SIMULATION_FOLDED;
 	int flags = STARPU_MALLOC_SIMULATION_FOLDED;
@@ -279,7 +286,7 @@ static void initialize_system(int argc, char **argv, float **A, unsigned pinned)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	if (ret == -ENODEV)
 	if (ret == -ENODEV)
-		exit(77);
+		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
 	init_sizes();
 	init_sizes();
@@ -301,6 +308,8 @@ static void initialize_system(int argc, char **argv, float **A, unsigned pinned)
 	if (pinned)
 	if (pinned)
 		flags |= STARPU_MALLOC_PINNED;
 		flags |= STARPU_MALLOC_PINNED;
 	starpu_malloc_flags((void **)A, size_p*size_p*sizeof(float), flags);
 	starpu_malloc_flags((void **)A, size_p*size_p*sizeof(float), flags);
+
+	return 0;
 }
 }
 
 
 int cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks)
 int cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks)
@@ -344,34 +353,33 @@ int main(int argc, char **argv)
 	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
 	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
 	 * */
 	 * */
 
 
-     	int ret;
-
 	float *mat = NULL;
 	float *mat = NULL;
-	initialize_system(argc, argv, &mat, pinned_p);
+	int ret = initialize_system(argc, argv, &mat, pinned_p);
+	if (ret) return ret;
 
 
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
-	unsigned i,j;
-	for (i = 0; i < size_p; i++)
+	unsigned m,n;
+
+	for (n = 0; n < size_p; n++)
 	{
 	{
-		for (j = 0; j < size_p; j++)
+		for (m = 0; m < size_p; m++)
 		{
 		{
-			mat[j +i*size_p] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size_p:0.0f);
-			/* mat[j +i*size_p] = ((i == j)?1.0f*size_p:0.0f); */
+			mat[m +n*size_p] = (1.0f/(1.0f+n+m)) + ((n == m)?1.0f*size_p:0.0f);
+			/* mat[m +n*size_p] = ((n == m)?1.0f*size_p:0.0f); */
 		}
 		}
 	}
 	}
-#endif
 
 
-
-#ifdef CHECK_OUTPUT
+/* #define PRINT_OUTPUT */
+#ifdef PRINT_OUTPUT
 	FPRINTF(stdout, "Input :\n");
 	FPRINTF(stdout, "Input :\n");
 
 
-	for (j = 0; j < size_p; j++)
+	for (m = 0; m < size_p; m++)
 	{
 	{
-		for (i = 0; i < size_p; i++)
+		for (n = 0; n < size_p; n++)
 		{
 		{
-			if (i <= j)
+			if (n <= m)
 			{
 			{
-				FPRINTF(stdout, "%2.2f\t", mat[j +i*size_p]);
+				FPRINTF(stdout, "%2.2f\t", mat[m +n*size_p]);
 			}
 			}
 			else
 			else
 			{
 			{
@@ -381,55 +389,89 @@ int main(int argc, char **argv)
 		FPRINTF(stdout, "\n");
 		FPRINTF(stdout, "\n");
 	}
 	}
 #endif
 #endif
+#endif
 
 
 	ret = cholesky_grain(mat, size_p, size_p, nblocks_p, nbigblocks_p);
 	ret = cholesky_grain(mat, size_p, size_p, nblocks_p, nbigblocks_p);
 
 
-#ifdef CHECK_OUTPUT
+#ifndef STARPU_SIMGRID
+#ifdef PRINT_OUTPUT
 	FPRINTF(stdout, "Results :\n");
 	FPRINTF(stdout, "Results :\n");
 
 
-	for (j = 0; j < size_p; j++)
+	for (m = 0; m < size_p; m++)
 	{
 	{
-		for (i = 0; i < size_p; i++)
+		for (n = 0; n < size_p; n++)
 		{
 		{
-			if (i <= j)
+			if (n <= m)
 			{
 			{
-				FPRINTF(stdout, "%2.2f\t", mat[j +i*size_p]);
+				FPRINTF(stdout, "%2.2f\t", mat[m +n*size_p]);
 			}
 			}
 			else
 			else
 			{
 			{
 				FPRINTF(stdout, ".\t");
 				FPRINTF(stdout, ".\t");
-				mat[j+i*size_p] = 0.0f; /* debug */
 			}
 			}
 		}
 		}
 		FPRINTF(stdout, "\n");
 		FPRINTF(stdout, "\n");
 	}
 	}
+#endif
 
 
-	FPRINTF(stderr, "compute explicit LLt ...\n");
-	float *test_mat = malloc(size_p*size_p*sizeof(float));
-	STARPU_ASSERT(test_mat);
-
-	STARPU_SSYRK("L", "N", size_p, size_p, 1.0f,
-		     mat, size_p, 0.0f, test_mat, size_p);
-
-	FPRINTF(stderr, "comparing results ...\n");
-	for (j = 0; j < size_p; j++)
+	if (check_p)
 	{
 	{
-		for (i = 0; i < size_p; i++)
+		FPRINTF(stderr, "compute explicit LLt ...\n");
+		for (m = 0; m < size_p; m++)
 		{
 		{
-			if (i <= j)
+			for (n = 0; n < size_p; n++)
 			{
 			{
-                                FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size_p]);
+				if (n > m)
+				{
+					mat[m+n*size_p] = 0.0f; /* debug */
+				}
 			}
 			}
-			else
+		}
+		float *test_mat = malloc(size_p*size_p*sizeof(float));
+		STARPU_ASSERT(test_mat);
+
+		STARPU_SSYRK("L", "N", size_p, size_p, 1.0f,
+			     mat, size_p, 0.0f, test_mat, size_p);
+
+		FPRINTF(stderr, "comparing results ...\n");
+#ifdef PRINT_OUTPUT
+		for (m = 0; m < size_p; m++)
+		{
+			for (n = 0; n < size_p; n++)
 			{
 			{
-				FPRINTF(stdout, ".\t");
+				if (n <= m)
+				{
+					FPRINTF(stdout, "%2.2f\t", test_mat[m +n*size_p]);
+				}
+				else
+				{
+					FPRINTF(stdout, ".\t");
+				}
 			}
 			}
+			FPRINTF(stdout, "\n");
 		}
 		}
-		FPRINTF(stdout, "\n");
+#endif
+
+		for (m = 0; m < size_p; m++)
+		{
+			for (n = 0; n < size_p; n++)
+			{
+				if (n <= m)
+				{
+	                                float orig = (1.0f/(1.0f+m+n)) + ((m == n)?1.0f*size_p:0.0f);
+	                                float err = fabsf(test_mat[m +n*size_p] - orig) / orig;
+	                                if (err > 0.0001)
+					{
+	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.6f != %2.6f (err %2.6f)\n", m, n, test_mat[m +n*size_p], orig, err);
+	                                        assert(0);
+	                                }
+	                        }
+			}
+	        }
+		free(test_mat);
 	}
 	}
-	free(test_mat);
 #endif
 #endif
 
 
 	shutdown_system(&mat, size_p, pinned_p);
 	shutdown_system(&mat, size_p, pinned_p);
-	return ret;
+	return 0;
 }
 }

+ 63 - 58
examples/cholesky/cholesky_implicit.c

@@ -23,6 +23,9 @@
  * The whole algorithm thus appears clearly in the task submission loop in _cholesky().
  * The whole algorithm thus appears clearly in the task submission loop in _cholesky().
  */
  */
 
 
+/* Note: this is using fortran ordering, i.e. column-major ordering, i.e.
+ * elements with consecutive row number are consecutive in memory */
+
 #include "cholesky.h"
 #include "cholesky.h"
 #include "../sched_ctx_utils/sched_ctx_utils.h"
 #include "../sched_ctx_utils/sched_ctx_utils.h"
 
 
@@ -46,9 +49,9 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 	double start;
 	double start;
 	double end;
 	double end;
 
 
-	unsigned i,j,k;
-	unsigned long n = starpu_matrix_get_nx(dataA);
-	unsigned long nn = n/nblocks;
+	unsigned k,m,n;
+	unsigned long nx = starpu_matrix_get_nx(dataA);
+	unsigned long nn = nx/nblocks;
 
 
 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
 
 
@@ -75,45 +78,45 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 		if (ret == -ENODEV) return 77;
 		if (ret == -ENODEV) return 77;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
 
-		for (j = k+1; j<nblocks; j++)
+		for (m = k+1; m<nblocks; m++)
 		{
 		{
-                        starpu_data_handle_t sdatakj = starpu_data_get_sub_data(dataA, 2, k, j);
+                        starpu_data_handle_t sdatamk = starpu_data_get_sub_data(dataA, 2, m, k);
 
 
                         ret = starpu_task_insert(&cl21,
                         ret = starpu_task_insert(&cl21,
-						 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - j) : (j == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 						 STARPU_R, sdatakk,
 						 STARPU_R, sdatakk,
-						 STARPU_RW, sdatakj,
+						 STARPU_RW, sdatamk,
 						 STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
 						 STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
-						 STARPU_TAG_ONLY, TAG21(k,j),
+						 STARPU_TAG_ONLY, TAG21(m,k),
 						 0);
 						 0);
 			if (ret == -ENODEV) return 77;
 			if (ret == -ENODEV) return 77;
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 		}
 		}
 		starpu_data_wont_use(sdatakk);
 		starpu_data_wont_use(sdatakk);
 
 
-		for (j = k+1; j<nblocks; j++)
+		for (m = k+1; m<nblocks; m++)
 		{
 		{
-                        starpu_data_handle_t sdatakj = starpu_data_get_sub_data(dataA, 2, k, j);
-			for (i = k+1; i<nblocks; i++)
+                        starpu_data_handle_t sdatamk = starpu_data_get_sub_data(dataA, 2, m, k);
+			for (n = k+1; n<nblocks; n++)
 			{
 			{
-				if (i <= j)
+				if (n <= m)
                                 {
                                 {
-					starpu_data_handle_t sdataki = starpu_data_get_sub_data(dataA, 2, k, i);
-					starpu_data_handle_t sdataij = starpu_data_get_sub_data(dataA, 2, i, j);
+					starpu_data_handle_t sdatank = starpu_data_get_sub_data(dataA, 2, n, k);
+					starpu_data_handle_t sdatamn = starpu_data_get_sub_data(dataA, 2, m, n);
 
 
 					ret = starpu_task_insert(&cl22,
 					ret = starpu_task_insert(&cl22,
-								 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - j - i) : ((i == k+1) && (j == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
-								 STARPU_R, sdataki,
-								 STARPU_R, sdatakj,
-								 cl22.modes[2], sdataij,
+								 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+								 STARPU_R, sdatamk,
+								 STARPU_R, sdatank,
+								 cl22.modes[2], sdatamn,
 								 STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
 								 STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
-								 STARPU_TAG_ONLY, TAG22(k,i,j),
+								 STARPU_TAG_ONLY, TAG22(k,m,n),
 								 0);
 								 0);
 					if (ret == -ENODEV) return 77;
 					if (ret == -ENODEV) return 77;
 					STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 					STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
                                 }
                                 }
 			}
 			}
-			starpu_data_wont_use(sdatakj);
+			starpu_data_wont_use(sdatamk);
 		}
 		}
 		starpu_iteration_pop();
 		starpu_iteration_pop();
 	}
 	}
@@ -128,7 +131,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
 
 	double timing = end - start;
 	double timing = end - start;
 
 
-	double flop = FLOPS_SPOTRF(n);
+	double flop = FLOPS_SPOTRF(nx);
 
 
 	if(with_ctxs_p || with_noctxs_p || chole1_p || chole2_p)
 	if(with_ctxs_p || with_noctxs_p || chole1_p || chole2_p)
 		update_sched_ctx_timing_results((flop/timing/1000.0f), (timing/1000000.0f));
 		update_sched_ctx_timing_results((flop/timing/1000.0f), (timing/1000000.0f));
@@ -139,7 +142,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 			PRINTF("\tTms\tTGFlops");
 			PRINTF("\tTms\tTGFlops");
 		PRINTF("\n");
 		PRINTF("\n");
 
 
-		PRINTF("%lu\t%.0f\t%.1f", n, timing/1000, (flop/timing/1000.0f));
+		PRINTF("%lu\t%.0f\t%.1f", nx, timing/1000, (flop/timing/1000.0f));
 		if (bound_lp_p)
 		if (bound_lp_p)
 		{
 		{
 			FILE *f = fopen("cholesky.lp", "w");
 			FILE *f = fopen("cholesky.lp", "w");
@@ -166,31 +169,34 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 static int cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 static int cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 {
 {
 	starpu_data_handle_t dataA;
 	starpu_data_handle_t dataA;
-	unsigned x, y;
+	unsigned m, n;
 
 
 	/* monitor and partition the A matrix into blocks :
 	/* monitor and partition the A matrix into blocks :
-	 * one block is now determined by 2 unsigned (i,j) */
+	 * one block is now determined by 2 unsigned (m,n) */
 	starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(float));
 	starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(float));
 
 
+	/* Split into blocks of complete rows first */
 	struct starpu_data_filter f =
 	struct starpu_data_filter f =
 	{
 	{
-		.filter_func = starpu_matrix_filter_vertical_block,
+		.filter_func = starpu_matrix_filter_block,
 		.nchildren = nblocks
 		.nchildren = nblocks
 	};
 	};
 
 
+	/* Then split rows into tiles */
 	struct starpu_data_filter f2 =
 	struct starpu_data_filter f2 =
 	{
 	{
-		.filter_func = starpu_matrix_filter_block,
+		/* Note: here "vertical" is for row-major, we are here using column-major. */
+		.filter_func = starpu_matrix_filter_vertical_block,
 		.nchildren = nblocks
 		.nchildren = nblocks
 	};
 	};
 
 
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 
 
-	for (x = 0; x < nblocks; x++)
-		for (y = 0; y < nblocks; y++)
+	for (m = 0; m < nblocks; m++)
+		for (n = 0; n < nblocks; n++)
 		{
 		{
-			starpu_data_handle_t data = starpu_data_get_sub_data(dataA, 2, x, y);
-			starpu_data_set_coordinates(data, 2, x, y);
+			starpu_data_handle_t data = starpu_data_get_sub_data(dataA, 2, m, n);
+			starpu_data_set_coordinates(data, 2, m, n);
 		}
 		}
 
 
 	int ret = _cholesky(dataA, nblocks);
 	int ret = _cholesky(dataA, nblocks);
@@ -206,14 +212,14 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 	float *mat = NULL;
 	float *mat = NULL;
 
 
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
-	unsigned i,j;
+	unsigned m,n;
 	starpu_malloc_flags((void **)&mat, (size_t)size*size*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
 	starpu_malloc_flags((void **)&mat, (size_t)size*size*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
-	for (i = 0; i < size; i++)
+	for (n = 0; n < size; n++)
 	{
 	{
-		for (j = 0; j < size; j++)
+		for (m = 0; m < size; m++)
 		{
 		{
-			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
+			mat[m +n*size] = (1.0f/(1.0f+m+n)) + ((m == n)?1.0f*size:0.0f);
+			/* mat[m +n*size] = ((m == n)?1.0f*size:0.0f); */
 		}
 		}
 	}
 	}
 
 
@@ -221,13 +227,13 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 #ifdef PRINT_OUTPUT
 #ifdef PRINT_OUTPUT
 	FPRINTF(stdout, "Input :\n");
 	FPRINTF(stdout, "Input :\n");
 
 
-	for (j = 0; j < size; j++)
+	for (m = 0; m < size; m++)
 	{
 	{
-		for (i = 0; i < size; i++)
+		for (n = 0; n < size; n++)
 		{
 		{
-			if (i <= j)
+			if (n <= m)
 			{
 			{
-				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+				FPRINTF(stdout, "%2.2f\t", mat[m +n*size]);
 			}
 			}
 			else
 			else
 			{
 			{
@@ -244,18 +250,17 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
 #ifdef PRINT_OUTPUT
 #ifdef PRINT_OUTPUT
 	FPRINTF(stdout, "Results :\n");
 	FPRINTF(stdout, "Results :\n");
-	for (j = 0; j < size; j++)
+	for (m = 0; m < size; m++)
 	{
 	{
-		for (i = 0; i < size; i++)
+		for (n = 0; n < size; n++)
 		{
 		{
-			if (i <= j)
+			if (n <= m)
 			{
 			{
-				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+				FPRINTF(stdout, "%2.2f\t", mat[m +n*size]);
 			}
 			}
 			else
 			else
 			{
 			{
 				FPRINTF(stdout, ".\t");
 				FPRINTF(stdout, ".\t");
-				mat[j+i*size] = 0.0f; /* debug */
 			}
 			}
 		}
 		}
 		FPRINTF(stdout, "\n");
 		FPRINTF(stdout, "\n");
@@ -265,13 +270,13 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 	if (check_p)
 	if (check_p)
 	{
 	{
 		FPRINTF(stderr, "compute explicit LLt ...\n");
 		FPRINTF(stderr, "compute explicit LLt ...\n");
-		for (j = 0; j < size; j++)
+		for (m = 0; m < size; m++)
 		{
 		{
-			for (i = 0; i < size; i++)
+			for (n = 0; n < size; n++)
 			{
 			{
-				if (i > j)
+				if (n > m)
 				{
 				{
-					mat[j+i*size] = 0.0f; /* debug */
+					mat[m+n*size] = 0.0f; /* debug */
 				}
 				}
 			}
 			}
 		}
 		}
@@ -283,13 +288,13 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 
 
 		FPRINTF(stderr, "comparing results ...\n");
 		FPRINTF(stderr, "comparing results ...\n");
 #ifdef PRINT_OUTPUT
 #ifdef PRINT_OUTPUT
-		for (j = 0; j < size; j++)
+		for (m = 0; m < size; m++)
 		{
 		{
-			for (i = 0; i < size; i++)
+			for (n = 0; n < size; n++)
 			{
 			{
-				if (i <= j)
+				if (n <= m)
 				{
 				{
-					FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
+					FPRINTF(stdout, "%2.2f\t", test_mat[m +n*size]);
 				}
 				}
 				else
 				else
 				{
 				{
@@ -300,17 +305,17 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 		}
 		}
 #endif
 #endif
 
 
-		for (j = 0; j < size; j++)
+		for (m = 0; m < size; m++)
 		{
 		{
-			for (i = 0; i < size; i++)
+			for (n = 0; n < size; n++)
 			{
 			{
-				if (i <= j)
+				if (n <= m)
 				{
 				{
-	                                float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-	                                float err = fabsf(test_mat[j +i*size] - orig) / orig;
+	                                float orig = (1.0f/(1.0f+m+n)) + ((m == n)?1.0f*size:0.0f);
+	                                float err = fabsf(test_mat[m +n*size] - orig) / orig;
 	                                if (err > 0.0001)
 	                                if (err > 0.0001)
 					{
 					{
-	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.6f != %2.6f (err %2.6f)\n", i, j, test_mat[j +i*size], orig, err);
+	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.6f != %2.6f (err %2.6f)\n", m, n, test_mat[m +n*size], orig, err);
 	                                        assert(0);
 	                                        assert(0);
 	                                }
 	                                }
 	                        }
 	                        }

+ 127 - 90
examples/cholesky/cholesky_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2008-2017                                Université de Bordeaux
+ * Copyright (C) 2008-2017,2020                           Université de Bordeaux
  * Copyright (C) 2012,2013                                Inria
  * Copyright (C) 2012,2013                                Inria
  * Copyright (C) 2010-2013,2015,2017                      CNRS
  * Copyright (C) 2010-2013,2015,2017                      CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2013                                     Thibaut Lambert
@@ -24,6 +24,9 @@
  * It also uses data partitioning to split the matrix into submatrices
  * It also uses data partitioning to split the matrix into submatrices
  */
  */
 
 
+/* Note: this is using fortran ordering, i.e. column-major ordering, i.e.
+ * elements with consecutive row number are consecutive in memory */
+
 #include "cholesky.h"
 #include "cholesky.h"
 #include <starpu_perfmodel.h>
 #include <starpu_perfmodel.h>
 
 
@@ -76,17 +79,19 @@ static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned
 	return task;
 	return task;
 }
 }
 
 
-static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j)
+static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned m)
 {
 {
-	struct starpu_task *task = create_task(TAG21(k, j));
+	int ret;
+
+	struct starpu_task *task = create_task(TAG21(k, m));
 
 
 	task->cl = &cl21;
 	task->cl = &cl21;
 
 
 	/* which sub-data is manipulated ? */
 	/* which sub-data is manipulated ? */
 	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
 	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
-	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
+	task->handles[1] = starpu_data_get_sub_data(dataA, 2, m, k);
 
 
-	if (!noprio_p && (j == k+1))
+	if (!noprio_p && (m == k+1))
 	{
 	{
 		task->priority = STARPU_MAX_PRIO;
 		task->priority = STARPU_MAX_PRIO;
 	}
 	}
@@ -94,39 +99,37 @@ static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j)
 	/* enforce dependencies ... */
 	/* enforce dependencies ... */
 	if (k > 0)
 	if (k > 0)
 	{
 	{
-		starpu_tag_declare_deps(TAG21(k, j), 2, TAG11(k), TAG22(k-1, k, j));
+		starpu_tag_declare_deps(TAG21(k, m), 2, TAG11(k), TAG22(k-1, m, k));
 	}
 	}
 	else
 	else
 	{
 	{
-		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
+		starpu_tag_declare_deps(TAG21(k, m), 1, TAG11(k));
 	}
 	}
 
 
-	int n = starpu_matrix_get_nx(task->handles[0]);
-	task->flops = FLOPS_STRSM(n, n);
-
-	int ret = starpu_task_submit(task);
-        if (STARPU_UNLIKELY(ret == -ENODEV))
-	{
-                FPRINTF(stderr, "No worker may execute this task\n");
-                exit(0);
-        }
+	int nx = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_STRSM(nx, nx);
 
 
+	ret = starpu_task_submit(task);
+	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	return ret;
 }
 }
 
 
-static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, unsigned j)
+static int create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned m, unsigned n)
 {
 {
-/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
+	int ret;
+
+/*	FPRINTF(stdout, "task 22 k,n,m = %d,%d,%d TAG = %llx\n", k,m,n, TAG22(k,m,n)); */
 
 
-	struct starpu_task *task = create_task(TAG22(k, i, j));
+	struct starpu_task *task = create_task(TAG22(k, m, n));
 
 
 	task->cl = &cl22;
 	task->cl = &cl22;
 
 
 	/* which sub-data is manipulated ? */
 	/* which sub-data is manipulated ? */
-	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, i);
-	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
-	task->handles[2] = starpu_data_get_sub_data(dataA, 2, i, j);
+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, n, k);
+	task->handles[1] = starpu_data_get_sub_data(dataA, 2, m, k);
+	task->handles[2] = starpu_data_get_sub_data(dataA, 2, m, n);
 
 
-	if (!noprio_p && (i == k + 1) && (j == k +1) )
+	if (!noprio_p && (n == k + 1) && (m == k +1) )
 	{
 	{
 		task->priority = STARPU_MAX_PRIO;
 		task->priority = STARPU_MAX_PRIO;
 	}
 	}
@@ -134,26 +137,21 @@ static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, u
 	/* enforce dependencies ... */
 	/* enforce dependencies ... */
 	if (k > 0)
 	if (k > 0)
 	{
 	{
-		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG21(k, i), TAG21(k, j));
+		starpu_tag_declare_deps(TAG22(k, m, n), 3, TAG22(k-1, m, n), TAG21(k, n), TAG21(k, m));
 	}
 	}
 	else
 	else
 	{
 	{
-		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
+		starpu_tag_declare_deps(TAG22(k, m, n), 2, TAG21(k, n), TAG21(k, m));
 	}
 	}
 
 
-	int n = starpu_matrix_get_nx(task->handles[0]);
-	task->flops = FLOPS_SGEMM(n, n, n);
+	int nx = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_SGEMM(nx, nx, nx);
 
 
-	int ret = starpu_task_submit(task);
-        if (STARPU_UNLIKELY(ret == -ENODEV))
-	{
-                FPRINTF(stderr, "No worker may execute this task\n");
-                exit(0);
-        }
+	ret = starpu_task_submit(task);
+	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	return ret;
 }
 }
 
 
-
-
 /*
 /*
  *	code to bootstrap the factorization
  *	code to bootstrap the factorization
  *	and construct the DAG
  *	and construct the DAG
@@ -161,13 +159,15 @@ static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, u
 
 
 static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 {
 {
+	int ret;
+
 	double start;
 	double start;
 	double end;
 	double end;
 
 
 	struct starpu_task *entry_task = NULL;
 	struct starpu_task *entry_task = NULL;
 
 
 	/* create all the DAG nodes */
 	/* create all the DAG nodes */
-	unsigned i,j,k;
+	unsigned k, m, n;
 
 
 	start = starpu_timing_now();
 	start = starpu_timing_now();
 
 
@@ -191,27 +191,27 @@ static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
 
 		}
 		}
 
 
-		for (j = k+1; j<nblocks; j++)
+		for (m = k+1; m<nblocks; m++)
 		{
 		{
-			create_task_21(dataA, k, j);
+			ret = create_task_21(dataA, k, m);
+			if (ret == -ENODEV) return 77;
 
 
-			for (i = k+1; i<nblocks; i++)
+			for (n = k+1; n<nblocks; n++)
 			{
 			{
-				if (i <= j)
-					create_task_22(dataA, k, i, j);
+				if (n <= m)
+				{
+					ret = create_task_22(dataA, k, m, n);
+					if (ret == -ENODEV) return 77;
+				}
 			}
 			}
 		}
 		}
 		starpu_iteration_pop();
 		starpu_iteration_pop();
 	}
 	}
 
 
 	/* schedule the codelet */
 	/* schedule the codelet */
-	int ret = starpu_task_submit(entry_task);
-        if (STARPU_UNLIKELY(ret == -ENODEV))
-	{
-                FPRINTF(stderr, "No worker may execute this task\n");
-                exit(0);
-        }
-
+	ret = starpu_task_submit(entry_task);
+	if (ret == -ENODEV) return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 
 	/* stall the application until the end of computations */
 	/* stall the application until the end of computations */
 	starpu_tag_wait(TAG11(nblocks-1));
 	starpu_tag_wait(TAG11(nblocks-1));
@@ -220,15 +220,16 @@ static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
 
 	end = starpu_timing_now();
 	end = starpu_timing_now();
 
 
-
 	double timing = end - start;
 	double timing = end - start;
 
 
-	unsigned n = starpu_matrix_get_nx(dataA);
+	unsigned nx = starpu_matrix_get_nx(dataA);
 
 
-	double flop = (1.0f*n*n*n)/3.0f;
+	double flop = (1.0f*nx*nx*nx)/3.0f;
 
 
 	PRINTF("# size\tms\tGFlops\n");
 	PRINTF("# size\tms\tGFlops\n");
-	PRINTF("%u\t%.0f\t%.1f\n", n, timing/1000, (flop/timing/1000.0f));
+	PRINTF("%u\t%.0f\t%.1f\n", nx, timing/1000, (flop/timing/1000.0f));
+
+	return 0;
 }
 }
 
 
 static int initialize_system(int argc, char **argv, float **A, unsigned pinned)
 static int initialize_system(int argc, char **argv, float **A, unsigned pinned)
@@ -273,20 +274,23 @@ static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 	starpu_data_handle_t dataA;
 	starpu_data_handle_t dataA;
 
 
 	/* monitor and partition the A matrix into blocks :
 	/* monitor and partition the A matrix into blocks :
-	 * one block is now determined by 2 unsigned (i,j) */
+	 * one block is now determined by 2 unsigned (m,n) */
 	starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(float));
 	starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(float));
 
 
 	starpu_data_set_sequential_consistency_flag(dataA, 0);
 	starpu_data_set_sequential_consistency_flag(dataA, 0);
 
 
+	/* Split into blocks of complete rows first */
 	struct starpu_data_filter f =
 	struct starpu_data_filter f =
 	{
 	{
-		.filter_func = starpu_matrix_filter_vertical_block,
+		.filter_func = starpu_matrix_filter_block,
 		.nchildren = nblocks
 		.nchildren = nblocks
 	};
 	};
 
 
+	/* Then split rows into tiles */
 	struct starpu_data_filter f2 =
 	struct starpu_data_filter f2 =
 	{
 	{
-		.filter_func = starpu_matrix_filter_block,
+		/* Note: here "vertical" is for row-major, we are here using column-major. */
+		.filter_func = starpu_matrix_filter_vertical_block,
 		.nchildren = nblocks
 		.nchildren = nblocks
 	};
 	};
 
 
@@ -321,28 +325,28 @@ int main(int argc, char **argv)
 	if (ret) return ret;
 	if (ret) return ret;
 
 
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
-	unsigned i,j;
-	for (i = 0; i < size_p; i++)
+	unsigned m,n;
+
+	for (n = 0; n < size_p; n++)
 	{
 	{
-		for (j = 0; j < size_p; j++)
+		for (m = 0; m < size_p; m++)
 		{
 		{
-			mat[j +i*size_p] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size_p:0.0f);
-			/* mat[j +i*size_p] = ((i == j)?1.0f*size_p:0.0f); */
+			mat[m +n*size_p] = (1.0f/(1.0f+n+m)) + ((n == m)?1.0f*size_p:0.0f);
+			/* mat[m +n*size_p] = ((n == m)?1.0f*size_p:0.0f); */
 		}
 		}
 	}
 	}
-#endif
 
 
-
-#ifdef CHECK_OUTPUT
+/* #define PRINT_OUTPUT */
+#ifdef PRINT_OUTPUT
 	FPRINTF(stdout, "Input :\n");
 	FPRINTF(stdout, "Input :\n");
 
 
-	for (j = 0; j < size_p; j++)
+	for (m = 0; m < size_p; m++)
 	{
 	{
-		for (i = 0; i < size_p; i++)
+		for (n = 0; n < size_p; n++)
 		{
 		{
-			if (i <= j)
+			if (n <= m)
 			{
 			{
-				FPRINTF(stdout, "%2.2f\t", mat[j +i*size_p]);
+				FPRINTF(stdout, "%2.2f\t", mat[m +n*size_p]);
 			}
 			}
 			else
 			else
 			{
 			{
@@ -352,54 +356,87 @@ int main(int argc, char **argv)
 		FPRINTF(stdout, "\n");
 		FPRINTF(stdout, "\n");
 	}
 	}
 #endif
 #endif
-
+#endif
 
 
 	cholesky(mat, size_p, size_p, nblocks_p);
 	cholesky(mat, size_p, size_p, nblocks_p);
 
 
-#ifdef CHECK_OUTPUT
+#ifndef STARPU_SIMGRID
+#ifdef PRINT_OUTPUT
 	FPRINTF(stdout, "Results :\n");
 	FPRINTF(stdout, "Results :\n");
 
 
-	for (j = 0; j < size_p; j++)
+	for (m = 0; m < size_p; m++)
 	{
 	{
-		for (i = 0; i < size_p; i++)
+		for (n = 0; n < size_p; n++)
 		{
 		{
-			if (i <= j)
+			if (n <= m)
 			{
 			{
-				FPRINTF(stdout, "%2.2f\t", mat[j +i*size_p]);
+				FPRINTF(stdout, "%2.2f\t", mat[m +n*size_p]);
 			}
 			}
 			else
 			else
 			{
 			{
 				FPRINTF(stdout, ".\t");
 				FPRINTF(stdout, ".\t");
-				mat[j+i*size_p] = 0.0f; /* debug */
 			}
 			}
 		}
 		}
 		FPRINTF(stdout, "\n");
 		FPRINTF(stdout, "\n");
 	}
 	}
+#endif
 
 
-	FPRINTF(stderr, "compute explicit LLt ...\n");
-	float *test_mat = malloc(size_p*size_p*sizeof(float));
-	STARPU_ASSERT(test_mat);
-
-	STARPU_SSYRK("L", "N", size_p, size_p, 1.0f,
-		     mat, size_p, 0.0f, test_mat, size_p);
-
-	FPRINTF(stderr, "comparing results ...\n");
-	for (j = 0; j < size_p; j++)
+	if (check_p)
 	{
 	{
-		for (i = 0; i < size_p; i++)
+		FPRINTF(stderr, "compute explicit LLt ...\n");
+		for (m = 0; m < size_p; m++)
 		{
 		{
-			if (i <= j)
+			for (n = 0; n < size_p; n++)
 			{
 			{
-				FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size_p]);
+				if (n > m)
+				{
+					mat[m+n*size_p] = 0.0f; /* debug */
+				}
 			}
 			}
-			else
+		}
+		float *test_mat = malloc(size_p*size_p*sizeof(float));
+		STARPU_ASSERT(test_mat);
+
+		STARPU_SSYRK("L", "N", size_p, size_p, 1.0f,
+			     mat, size_p, 0.0f, test_mat, size_p);
+
+		FPRINTF(stderr, "comparing results ...\n");
+#ifdef PRINT_OUTPUT
+		for (m = 0; m < size_p; m++)
+		{
+			for (n = 0; n < size_p; n++)
 			{
 			{
-				FPRINTF(stdout, ".\t");
+				if (n <= m)
+				{
+					FPRINTF(stdout, "%2.2f\t", test_mat[m +n*size_p]);
+				}
+				else
+				{
+					FPRINTF(stdout, ".\t");
+				}
 			}
 			}
+			FPRINTF(stdout, "\n");
 		}
 		}
-		FPRINTF(stdout, "\n");
+#endif
+
+		for (m = 0; m < size_p; m++)
+		{
+			for (n = 0; n < size_p; n++)
+			{
+				if (n <= m)
+				{
+	                                float orig = (1.0f/(1.0f+m+n)) + ((m == n)?1.0f*size_p:0.0f);
+	                                float err = fabsf(test_mat[m +n*size_p] - orig) / orig;
+	                                if (err > 0.0001)
+					{
+	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.6f != %2.6f (err %2.6f)\n", m, n, test_mat[m +n*size_p], orig, err);
+	                                        assert(0);
+	                                }
+	                        }
+			}
+	        }
+		free(test_mat);
 	}
 	}
-	free(test_mat);
 #endif
 #endif
 
 
 	shutdown_system(&mat, size_p, pinned_p);
 	shutdown_system(&mat, size_p, pinned_p);

+ 55 - 52
examples/cholesky/cholesky_tile_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009-2017                                Université de Bordeaux
+ * Copyright (C) 2009-2017,2020                           Université de Bordeaux
  * Copyright (C) 2012,2013                                Inria
  * Copyright (C) 2012,2013                                Inria
  * Copyright (C) 2010-2013,2015-2017                      CNRS
  * Copyright (C) 2010-2013,2015-2017                      CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2013                                     Thibaut Lambert
@@ -23,13 +23,16 @@
  * It also directly registers matrix tiles instead of using partitioning.
  * It also directly registers matrix tiles instead of using partitioning.
  */
  */
 
 
+/* Note: this is using fortran ordering, i.e. column-major ordering, i.e.
+ * elements with consecutive row number are consecutive in memory */
+
 #include "cholesky.h"
 #include "cholesky.h"
 
 
 #if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_MAGMA)
 #if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_MAGMA)
 #include "magma.h"
 #include "magma.h"
 #endif
 #endif
 
 
-/* A [ y ] [ x ] */
+/* A [ m ] [ n ] */
 float *A[NMAXBLOCKS][NMAXBLOCKS];
 float *A[NMAXBLOCKS][NMAXBLOCKS];
 starpu_data_handle_t A_state[NMAXBLOCKS][NMAXBLOCKS];
 starpu_data_handle_t A_state[NMAXBLOCKS][NMAXBLOCKS];
 
 
@@ -78,19 +81,19 @@ static struct starpu_task * create_task_11(unsigned k, unsigned nblocks)
 	return task;
 	return task;
 }
 }
 
 
-static int create_task_21(unsigned k, unsigned j)
+static int create_task_21(unsigned k, unsigned m)
 {
 {
 	int ret;
 	int ret;
 
 
-	struct starpu_task *task = create_task(TAG21(k, j));
+	struct starpu_task *task = create_task(TAG21(m, k));
 
 
 	task->cl = &cl21;
 	task->cl = &cl21;
 
 
 	/* which sub-data is manipulated ? */
 	/* which sub-data is manipulated ? */
 	task->handles[0] = A_state[k][k];
 	task->handles[0] = A_state[k][k];
-	task->handles[1] = A_state[j][k];
+	task->handles[1] = A_state[m][k];
 
 
-	if (j == k+1)
+	if (m == k+1)
 	{
 	{
 		task->priority = STARPU_MAX_PRIO;
 		task->priority = STARPU_MAX_PRIO;
 	}
 	}
@@ -98,11 +101,11 @@ static int create_task_21(unsigned k, unsigned j)
 	/* enforce dependencies ... */
 	/* enforce dependencies ... */
 	if (k > 0)
 	if (k > 0)
 	{
 	{
-		starpu_tag_declare_deps(TAG21(k, j), 2, TAG11(k), TAG22(k-1, k, j));
+		starpu_tag_declare_deps(TAG21(m, k), 2, TAG11(k), TAG22(k-1, m, k));
 	}
 	}
 	else
 	else
 	{
 	{
-		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
+		starpu_tag_declare_deps(TAG21(m, k), 1, TAG11(k));
 	}
 	}
 
 
 	int n = starpu_matrix_get_nx(task->handles[0]);
 	int n = starpu_matrix_get_nx(task->handles[0]);
@@ -113,22 +116,22 @@ static int create_task_21(unsigned k, unsigned j)
 	return ret;
 	return ret;
 }
 }
 
 
-static int create_task_22(unsigned k, unsigned i, unsigned j)
+static int create_task_22(unsigned k, unsigned m, unsigned n)
 {
 {
 	int ret;
 	int ret;
 
 
-/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
+/*	FPRINTF(stdout, "task 22 k,n,m = %d,%d,%d TAG = %llx\n", k,m,n, TAG22(k,m,n)); */
 
 
-	struct starpu_task *task = create_task(TAG22(k, i, j));
+	struct starpu_task *task = create_task(TAG22(k, m, n));
 
 
 	task->cl = &cl22;
 	task->cl = &cl22;
 
 
 	/* which sub-data is manipulated ? */
 	/* which sub-data is manipulated ? */
-	task->handles[0] = A_state[i][k];
-	task->handles[1] = A_state[j][k];
-	task->handles[2] = A_state[j][i];
+	task->handles[0] = A_state[n][k];
+	task->handles[1] = A_state[m][k];
+	task->handles[2] = A_state[m][n];
 
 
-	if ( (i == k + 1) && (j == k +1) )
+	if (!noprio_p && (n == k + 1) && (m == k +1) )
 	{
 	{
 		task->priority = STARPU_MAX_PRIO;
 		task->priority = STARPU_MAX_PRIO;
 	}
 	}
@@ -136,15 +139,15 @@ static int create_task_22(unsigned k, unsigned i, unsigned j)
 	/* enforce dependencies ... */
 	/* enforce dependencies ... */
 	if (k > 0)
 	if (k > 0)
 	{
 	{
-		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG21(k, i), TAG21(k, j));
+		starpu_tag_declare_deps(TAG22(k, m, n), 3, TAG22(k-1, m, n), TAG21(n, k), TAG21(m, k));
 	}
 	}
 	else
 	else
 	{
 	{
-		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
+		starpu_tag_declare_deps(TAG22(k, m, n), 2, TAG21(n, k), TAG21(m, k));
 	}
 	}
 
 
-	int n = starpu_matrix_get_nx(task->handles[0]);
-	task->flops = FLOPS_SGEMM(n, n, n);
+	int nx = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_SGEMM(nx, nx, nx);
 
 
 	ret = starpu_task_submit(task);
 	ret = starpu_task_submit(task);
 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
@@ -166,7 +169,7 @@ static int cholesky_no_stride(void)
 	struct starpu_task *entry_task = NULL;
 	struct starpu_task *entry_task = NULL;
 
 
 	/* create all the DAG nodes */
 	/* create all the DAG nodes */
-	unsigned i,j,k;
+	unsigned k, m, n;
 
 
 	for (k = 0; k < nblocks_p; k++)
 	for (k = 0; k < nblocks_p; k++)
 	{
 	{
@@ -183,17 +186,17 @@ static int cholesky_no_stride(void)
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		}
 		}
 
 
-		for (j = k+1; j<nblocks_p; j++)
+		for (m = k+1; m<nblocks_p; m++)
 		{
 		{
-			ret = create_task_21(k, j);
+			ret = create_task_21(k, m);
 			if (ret == -ENODEV) return 77;
 			if (ret == -ENODEV) return 77;
 
 
-			for (i = k+1; i<nblocks_p; i++)
+			for (n = k+1; n<nblocks_p; n++)
 			{
 			{
-				if (i <= j)
+				if (n <= m)
 				{
 				{
-				     ret = create_task_22(k, i, j);
-				     if (ret == -ENODEV) return 77;
+					ret = create_task_22(k, m, n);
+					if (ret == -ENODEV) return 77;
 				}
 				}
 			}
 			}
 		}
 		}
@@ -222,7 +225,7 @@ static int cholesky_no_stride(void)
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-	unsigned x, y;
+	unsigned n, m;
 	int ret;
 	int ret;
 
 
 #ifdef STARPU_HAVE_MAGMA
 #ifdef STARPU_HAVE_MAGMA
@@ -256,13 +259,13 @@ int main(int argc, char **argv)
 
 
 	starpu_cublas_init();
 	starpu_cublas_init();
 
 
-	for (y = 0; y < nblocks_p; y++)
-	for (x = 0; x < nblocks_p; x++)
+	for (m = 0; m < nblocks_p; m++)
+	for (n = 0; n < nblocks_p; n++)
 	{
 	{
-		if (x <= y)
+		if (n <= m)
 		{
 		{
-			starpu_malloc_flags((void **)&A[y][x], BLOCKSIZE*BLOCKSIZE*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
-			assert(A[y][x]);
+			starpu_malloc_flags((void **)&A[m][n], BLOCKSIZE*BLOCKSIZE*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+			assert(A[m][n]);
 		}
 		}
 	}
 	}
 
 
@@ -271,44 +274,44 @@ int main(int argc, char **argv)
 	 *
 	 *
 	 *	Hilbert matrix : h(i,j) = 1/(i+j+1) ( + n In to make is stable )
 	 *	Hilbert matrix : h(i,j) = 1/(i+j+1) ( + n In to make is stable )
 	 * */
 	 * */
-	for (y = 0; y < nblocks_p; y++)
-	for (x = 0; x < nblocks_p; x++)
-	if (x <= y)
+	for (m = 0; m < nblocks_p; m++)
+	for (n = 0; n < nblocks_p; n++)
+	if (n <= m)
 	{
 	{
-		unsigned i, j;
-		for (i = 0; i < BLOCKSIZE; i++)
-		for (j = 0; j < BLOCKSIZE; j++)
+		unsigned mm, nn;
+		for (mm = 0; mm < BLOCKSIZE; mm++)
+		for (nn = 0; nn < BLOCKSIZE; nn++)
 		{
 		{
-			A[y][x][i*BLOCKSIZE + j] =
-				(float)(1.0f/((float) (1.0+(x*BLOCKSIZE+i)+(y*BLOCKSIZE+j))));
+			A[m][n][mm*BLOCKSIZE + nn] =
+				(float)(1.0f/((float) (1.0+(n*BLOCKSIZE+mm)+(m*BLOCKSIZE+nn))));
 
 
 			/* make it a little more numerically stable ... ;) */
 			/* make it a little more numerically stable ... ;) */
-			if ((x == y) && (i == j))
-				A[y][x][i*BLOCKSIZE + j] += (float)(2*size_p);
+			if ((n == m) && (mm == nn))
+				A[m][n][mm*BLOCKSIZE + nn] += (float)(2*size_p);
 		}
 		}
 	}
 	}
 #endif
 #endif
 
 
-	for (y = 0; y < nblocks_p; y++)
-	for (x = 0; x < nblocks_p; x++)
+	for (m = 0; m < nblocks_p; m++)
+	for (n = 0; n < nblocks_p; n++)
 	{
 	{
-		if (x <= y)
+		if (n <= m)
 		{
 		{
-			starpu_matrix_data_register(&A_state[y][x], STARPU_MAIN_RAM, (uintptr_t)A[y][x],
+			starpu_matrix_data_register(&A_state[m][n], STARPU_MAIN_RAM, (uintptr_t)A[m][n],
 						    BLOCKSIZE, BLOCKSIZE, BLOCKSIZE, sizeof(float));
 						    BLOCKSIZE, BLOCKSIZE, BLOCKSIZE, sizeof(float));
-			starpu_data_set_coordinates(A_state[y][x], 2, x, y);
+			starpu_data_set_coordinates(A_state[m][n], 2, n, m);
 		}
 		}
 	}
 	}
 
 
 	ret = cholesky_no_stride();
 	ret = cholesky_no_stride();
 
 
-	for (y = 0; y < nblocks_p; y++)
-	for (x = 0; x < nblocks_p; x++)
+	for (m = 0; m < nblocks_p; m++)
+	for (n = 0; n < nblocks_p; n++)
 	{
 	{
-		if (x <= y)
+		if (n <= m)
 		{
 		{
-			starpu_data_unregister(A_state[y][x]);
-			starpu_free_flags(A[y][x], BLOCKSIZE*BLOCKSIZE*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+			starpu_data_unregister(A_state[m][n]);
+			starpu_free_flags(A[m][n], BLOCKSIZE*BLOCKSIZE*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
 		}
 		}
 	}
 	}
 
 

+ 5 - 1
include/starpu_data_filters.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009-2012,2014,2015,2017,2019            Université de Bordeaux
+ * Copyright (C) 2009-2012,2014,2015,2017,2019-2020       Université de Bordeaux
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010-2013,2015,2017,2018,2019            CNRS
  * Copyright (C) 2010-2013,2015,2017,2018,2019            CNRS
  * Copyright (C) 2011                                     Inria
  * Copyright (C) 2011                                     Inria
@@ -344,6 +344,8 @@ void starpu_csr_filter_vertical_block(void *father_interface, void *child_interf
    Predefined partitioning functions for matrix
    Predefined partitioning functions for matrix
    data. Examples on how to use them are shown in \ref
    data. Examples on how to use them are shown in \ref
    PartitioningData.
    PartitioningData.
+   Note: this is using the C element order which is row-major, i.e. elements
+   with consecutive x coordinates are consecutive in memory.
    @{
    @{
 */
 */
 
 
@@ -450,6 +452,8 @@ void starpu_vector_filter_divide_in_2(void *father_interface, void *child_interf
    Predefined partitioning functions for block data. Examples on how
    Predefined partitioning functions for block data. Examples on how
    to use them are shown in \ref PartitioningData. An example is
    to use them are shown in \ref PartitioningData. An example is
    available in \c examples/filters/shadow3d.c
    available in \c examples/filters/shadow3d.c
+   Note: this is using the C element order which is row-major, i.e. elements
+   with consecutive x coordinates are consecutive in memory.
    @{
    @{
 */
 */
 
 

+ 53 - 53
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -79,24 +79,24 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 	double start;
 	double start;
 	double end;
 	double end;
 	starpu_data_handle_t **data_handles;
 	starpu_data_handle_t **data_handles;
-	unsigned x,y,i,j,k;
+	unsigned k, m, n;
 
 
 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
 
 
 	/* create all the DAG nodes */
 	/* create all the DAG nodes */
 
 
 	data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
 	data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
-	for(x=0 ; x<nblocks ; x++) data_handles[x] = malloc(nblocks*sizeof(starpu_data_handle_t));
+	for(m=0 ; m<nblocks ; m++) data_handles[m] = malloc(nblocks*sizeof(starpu_data_handle_t));
 
 
-	for(x = 0; x < nblocks ; x++)
+	for (m = 0; m < nblocks; m++)
 	{
 	{
-		for (y = 0; y < nblocks; y++)
+		for(n = 0; n < nblocks ; n++)
 		{
 		{
-			int mpi_rank = my_distrib(x, y, nodes);
+			int mpi_rank = my_distrib(m, n, nodes);
 			if (mpi_rank == rank)
 			if (mpi_rank == rank)
 			{
 			{
-				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
-				starpu_matrix_data_register(&data_handles[x][y], STARPU_MAIN_RAM, (uintptr_t)matA[x][y],
+				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, n, m);
+				starpu_matrix_data_register(&data_handles[m][n], STARPU_MAIN_RAM, (uintptr_t)matA[m][n],
 						ld, size/nblocks, size/nblocks, sizeof(float));
 						ld, size/nblocks, size/nblocks, sizeof(float));
 			}
 			}
 #ifdef STARPU_DEVEL
 #ifdef STARPU_DEVEL
@@ -105,14 +105,14 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 			else
 			else
 			{
 			{
 				/* I don't own this index, but will need it for my computations */
 				/* I don't own this index, but will need it for my computations */
-				//fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
-				starpu_matrix_data_register(&data_handles[x][y], -1, (uintptr_t)NULL,
+				//fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, n, m);
+				starpu_matrix_data_register(&data_handles[m][n], -1, (uintptr_t)NULL,
 						ld, size/nblocks, size/nblocks, sizeof(float));
 						ld, size/nblocks, size/nblocks, sizeof(float));
 			}
 			}
-			if (data_handles[x][y])
+			if (data_handles[m][n])
 			{
 			{
-				starpu_data_set_coordinates(data_handles[x][y], 2, x, y);
-				starpu_mpi_data_register(data_handles[x][y], (y*nblocks)+x, mpi_rank);
+				starpu_data_set_coordinates(data_handles[m][n], 2, n, m);
+				starpu_mpi_data_register(data_handles[m][n], (m*nblocks)+n, mpi_rank);
 			}
 			}
 		}
 		}
 	}
 	}
@@ -129,34 +129,34 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 				       STARPU_RW, data_handles[k][k],
 				       STARPU_RW, data_handles[k][k],
 				       0);
 				       0);
 
 
-		for (j = k+1; j<nblocks; j++)
+		for (m = k+1; m<nblocks; m++)
 		{
 		{
 			starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
 			starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
-					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - j) : (j == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 					       STARPU_R, data_handles[k][k],
 					       STARPU_R, data_handles[k][k],
-					       STARPU_RW, data_handles[k][j],
+					       STARPU_RW, data_handles[m][k],
 					       0);
 					       0);
 
 
 			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
 			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
 			if (my_distrib(k, k, nodes) == rank)
 			if (my_distrib(k, k, nodes) == rank)
 				starpu_data_wont_use(data_handles[k][k]);
 				starpu_data_wont_use(data_handles[k][k]);
 
 
-			for (i = k+1; i<nblocks; i++)
+			for (n = k+1; n<nblocks; n++)
 			{
 			{
-				if (i <= j)
+				if (n <= m)
 				{
 				{
 					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
 					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
-							       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - j - i) : ((i == k+1) && (j == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
-							       STARPU_R, data_handles[k][i],
-							       STARPU_R, data_handles[k][j],
-							       STARPU_RW | STARPU_COMMUTE, data_handles[i][j],
+							       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+							       STARPU_R, data_handles[n][k],
+							       STARPU_R, data_handles[m][k],
+							       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
 							       0);
 							       0);
 				}
 				}
 			}
 			}
 
 
-			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][j]);
-			if (my_distrib(k, j, nodes) == rank)
-				starpu_data_wont_use(data_handles[k][j]);
+			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
+			if (my_distrib(m, k, nodes) == rank)
+				starpu_data_wont_use(data_handles[m][k]);
 		}
 		}
 		starpu_iteration_pop();
 		starpu_iteration_pop();
 	}
 	}
@@ -166,18 +166,18 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 	end = starpu_timing_now();
 	end = starpu_timing_now();
 
 
-	for(x = 0; x < nblocks ; x++)
+	for (m = 0; m < nblocks; m++)
 	{
 	{
-		for (y = 0; y < nblocks; y++)
+		for(n = 0; n < nblocks ; n++)
 		{
 		{
 			/* Get back data on node 0 for the check */
 			/* Get back data on node 0 for the check */
 			if (check)
 			if (check)
-				starpu_mpi_get_data_on_node(MPI_COMM_WORLD, data_handles[x][y], 0);
+				starpu_mpi_get_data_on_node(MPI_COMM_WORLD, data_handles[m][n], 0);
 
 
-			if (data_handles[x][y])
-				starpu_data_unregister(data_handles[x][y]);
+			if (data_handles[m][n])
+				starpu_data_unregister(data_handles[m][n]);
 		}
 		}
-		free(data_handles[x]);
+		free(data_handles[m]);
 	}
 	}
 	free(data_handles);
 	free(data_handles);
 
 
@@ -190,31 +190,31 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 
 
 void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *correctness, double *flops, double epsilon)
 void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *correctness, double *flops, double epsilon)
 {
 {
-	unsigned i,j,x,y;
+	unsigned nn,mm,n,m;
 	float *rmat = malloc(size*size*sizeof(float));
 	float *rmat = malloc(size*size*sizeof(float));
 
 
-	for(x=0 ; x<nblocks ; x++)
+	for(n=0 ; n<nblocks ; n++)
 	{
 	{
-		for(y=0 ; y<nblocks ; y++)
+		for(m=0 ; m<nblocks ; m++)
 		{
 		{
-			for (i = 0; i < BLOCKSIZE; i++)
+			for (nn = 0; nn < BLOCKSIZE; nn++)
 			{
 			{
-				for (j = 0; j < BLOCKSIZE; j++)
+				for (mm = 0; mm < BLOCKSIZE; mm++)
 				{
 				{
-					rmat[j+(y*BLOCKSIZE)+(i+(x*BLOCKSIZE))*size] = matA[x][y][j +i*BLOCKSIZE];
+					rmat[mm+(m*BLOCKSIZE)+(nn+(n*BLOCKSIZE))*size] = matA[m][n][mm +nn*BLOCKSIZE];
 				}
 				}
 			}
 			}
 		}
 		}
 	}
 	}
 
 
 	FPRINTF(stderr, "[%d] compute explicit LLt ...\n", rank);
 	FPRINTF(stderr, "[%d] compute explicit LLt ...\n", rank);
-	for (j = 0; j < size; j++)
+	for (mm = 0; mm < size; mm++)
 	{
 	{
-		for (i = 0; i < size; i++)
+		for (nn = 0; nn < size; nn++)
 		{
 		{
-			if (i > j)
+			if (nn > mm)
 			{
 			{
-				rmat[j+i*size] = 0.0f; // debug
+				rmat[mm+nn*size] = 0.0f; // debug
 			}
 			}
 		}
 		}
 	}
 	}
@@ -227,13 +227,13 @@ void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *corr
 	FPRINTF(stderr, "[%d] comparing results ...\n", rank);
 	FPRINTF(stderr, "[%d] comparing results ...\n", rank);
 	if (display)
 	if (display)
 	{
 	{
-		for (j = 0; j < size; j++)
+		for (mm = 0; mm < size; mm++)
 		{
 		{
-			for (i = 0; i < size; i++)
+			for (nn = 0; nn < size; nn++)
 			{
 			{
-				if (i <= j)
+				if (nn <= mm)
 				{
 				{
-					printf("%2.2f\t", test_mat[j +i*size]);
+					printf("%2.2f\t", test_mat[mm +nn*size]);
 				}
 				}
 				else
 				else
 				{
 				{
@@ -245,24 +245,24 @@ void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *corr
 	}
 	}
 
 
 	*correctness = 1;
 	*correctness = 1;
-	for(x = 0; x < nblocks ; x++)
+	for(n = 0; n < nblocks ; n++)
 	{
 	{
-		for (y = 0; y < nblocks; y++)
+		for (m = 0; m < nblocks; m++)
 		{
 		{
-			int mpi_rank = my_distrib(x, y, nodes);
+			int mpi_rank = my_distrib(m, n, nodes);
 			if (mpi_rank == rank)
 			if (mpi_rank == rank)
 			{
 			{
-				for (i = (size/nblocks)*x ; i < (size/nblocks)*x+(size/nblocks); i++)
+				for (nn = (size/nblocks)*n ; nn < (size/nblocks)*n+(size/nblocks); nn++)
 				{
 				{
-					for (j = (size/nblocks)*y ; j < (size/nblocks)*y+(size/nblocks); j++)
+					for (mm = (size/nblocks)*m ; mm < (size/nblocks)*m+(size/nblocks); mm++)
 					{
 					{
-						if (i <= j)
+						if (nn <= mm)
 						{
 						{
-							float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-							float err = fabsf(test_mat[j +i*size] - orig) / orig;
+							float orig = (1.0f/(1.0f+nn+mm)) + ((nn == mm)?1.0f*size:0.0f);
+							float err = fabsf(test_mat[mm +nn*size] - orig) / orig;
 							if (err > epsilon)
 							if (err > epsilon)
 							{
 							{
-								FPRINTF(stderr, "[%d] Error[%u, %u] --> %2.20f != %2.20f (err %2.20f)\n", rank, i, j, test_mat[j +i*size], orig, err);
+								FPRINTF(stderr, "[%d] Error[%u, %u] --> %2.20f != %2.20f (err %2.20f)\n", rank, nn, mm, test_mat[mm +nn*size], orig, err);
 								*correctness = 0;
 								*correctness = 0;
 								*flops = 0;
 								*flops = 0;
 								break;
 								break;

+ 21 - 18
mpi/examples/matrix_decomposition/mpi_decomposition_matrix.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2013,2015-2017                      CNRS
  * Copyright (C) 2010-2013,2015-2017                      CNRS
- * Copyright (C) 2009-2012,2014,2015                      Université de Bordeaux
+ * Copyright (C) 2009-2012,2014,2015,2020                 Université de Bordeaux
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010                                     Mehdi Juhoor
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -19,7 +19,7 @@
 #include "mpi_cholesky.h"
 #include "mpi_cholesky.h"
 
 
 /* Returns the MPI node number where data indexes index is */
 /* Returns the MPI node number where data indexes index is */
-int my_distrib(int x, int y, int nb_nodes)
+int my_distrib(int y, int x, int nb_nodes)
 {
 {
 	(void)nb_nodes;
 	(void)nb_nodes;
 	//return (x+y) % nb_nodes;
 	//return (x+y) % nb_nodes;
@@ -62,27 +62,30 @@ void matrix_display(float ***bmat, int rank)
 	}
 	}
 }
 }
 
 
+/* Note: bmat is indexed by bmat[m][n][mm+nn*BLOCKSIZE],
+ * i.e. the content of the tiles is column-major, but the array of tiles is
+ * row-major to keep the m,n notation everywhere */
 void matrix_init(float ****bmat, int rank, int nodes, int alloc_everywhere)
 void matrix_init(float ****bmat, int rank, int nodes, int alloc_everywhere)
 {
 {
-	unsigned i,j,x,y;
+	unsigned nn,mm,m,n;
 
 
 	*bmat = malloc(nblocks * sizeof(float **));
 	*bmat = malloc(nblocks * sizeof(float **));
-	for(x=0 ; x<nblocks ; x++)
+	for(m=0 ; m<nblocks ; m++)
 	{
 	{
-		(*bmat)[x] = malloc(nblocks * sizeof(float *));
-		for(y=0 ; y<nblocks ; y++)
+		(*bmat)[m] = malloc(nblocks * sizeof(float *));
+		for(n=0 ; n<nblocks ; n++)
 		{
 		{
-			int mpi_rank = my_distrib(x, y, nodes);
+			int mpi_rank = my_distrib(m, n, nodes);
 			if (alloc_everywhere || (mpi_rank == rank))
 			if (alloc_everywhere || (mpi_rank == rank))
 			{
 			{
-				starpu_malloc((void **)&(*bmat)[x][y], BLOCKSIZE*BLOCKSIZE*sizeof(float));
-				for (i = 0; i < BLOCKSIZE; i++)
+				starpu_malloc((void **)&(*bmat)[m][n], BLOCKSIZE*BLOCKSIZE*sizeof(float));
+				for (nn = 0; nn < BLOCKSIZE; nn++)
 				{
 				{
-					for (j = 0; j < BLOCKSIZE; j++)
+					for (mm = 0; mm < BLOCKSIZE; mm++)
 					{
 					{
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
-						(*bmat)[x][y][j +i*BLOCKSIZE] = (1.0f/(1.0f+(i+(x*BLOCKSIZE)+j+(y*BLOCKSIZE)))) + ((i+(x*BLOCKSIZE) == j+(y*BLOCKSIZE))?1.0f*size:0.0f);
-						//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
+						(*bmat)[m][n][mm +nn*BLOCKSIZE] = (1.0f/(1.0f+(nn+(m*BLOCKSIZE)+mm+(n*BLOCKSIZE)))) + ((nn+(m*BLOCKSIZE) == mm+(n*BLOCKSIZE))?1.0f*size:0.0f);
+						//mat[mm +nn*size] = ((nn == mm)?1.0f*size:0.0f);
 #endif
 #endif
 					}
 					}
 				}
 				}
@@ -93,19 +96,19 @@ void matrix_init(float ****bmat, int rank, int nodes, int alloc_everywhere)
 
 
 void matrix_free(float ****bmat, int rank, int nodes, int alloc_everywhere)
 void matrix_free(float ****bmat, int rank, int nodes, int alloc_everywhere)
 {
 {
-	unsigned x, y;
+	unsigned m, n;
 
 
-	for(x=0 ; x<nblocks ; x++)
+	for(m=0 ; m<nblocks ; m++)
 	{
 	{
-		for(y=0 ; y<nblocks ; y++)
+		for(n=0 ; n<nblocks ; n++)
 		{
 		{
-			int mpi_rank = my_distrib(x, y, nodes);
+			int mpi_rank = my_distrib(m, n, nodes);
 			if (alloc_everywhere || (mpi_rank == rank))
 			if (alloc_everywhere || (mpi_rank == rank))
 			{
 			{
-				starpu_free((void *)(*bmat)[x][y]);
+				starpu_free((void *)(*bmat)[m][n]);
 			}
 			}
 		}
 		}
-		free((*bmat)[x]);
+		free((*bmat)[m]);
 	}
 	}
 	free(*bmat);
 	free(*bmat);
 }
 }

+ 2 - 2
mpi/examples/matrix_decomposition/mpi_decomposition_matrix.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2013,2015,2017                      CNRS
  * Copyright (C) 2010-2013,2015,2017                      CNRS
- * Copyright (C) 2009-2012,2014                           Université de Bordeaux
+ * Copyright (C) 2009-2012,2014,2020                      Université de Bordeaux
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010                                     Mehdi Juhoor
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -20,7 +20,7 @@
 #define __MPI_CHOLESKY_MATRIX_H__
 #define __MPI_CHOLESKY_MATRIX_H__
 
 
 /* Returns the MPI node number where data indexes index is */
 /* Returns the MPI node number where data indexes index is */
-int my_distrib(int x, int y, int nb_nodes);
+int my_distrib(int y, int x, int nb_nodes);
 
 
 void matrix_display(float ***bmat, int rank);
 void matrix_display(float ***bmat, int rank);
 void matrix_init(float ****bmat, int rank, int nodes, int alloc_everywhere);
 void matrix_init(float ****bmat, int rank, int nodes, int alloc_everywhere);