Browse Source

Checkpoint in all cholesky algo, correct data wont use

Romain LION 5 years ago
parent
commit
afdf3f5659
1 changed files with 27 additions and 5 deletions
  1. 27 5
      mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

+ 27 - 5
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -86,6 +86,8 @@ starpu_mpi_checkpoint_template_t* checkpoint_p;
 /*
  * Map a rank to its successor modulo _nodes: rank r yields r+1, and the
  * result wraps to 0 when r+1 equals _nodes (_nodes is defined outside this
  * excerpt).
  *
  * Passed as the last argument of starpu_mpi_checkpoint_template_add_entry()
  * in this file; presumably it selects the rank that stores the backup copy
  * of a checkpointed entry -- TODO confirm against the StarPU-MPI checkpoint API.
  */
 int backup_function(int rank)
 {
 	return (rank+1)%_nodes;
+
+//	return (x%dblockx)+(y%dblocky)*dblockx;
 }
 
 
@@ -144,7 +146,7 @@ static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int node
 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
 	unsigned nn = size/nblocks;
 
-	starpu_mpi_checkpoint_template_add_entry(checkpoint_p, STARPU_VALUE, &k, sizeof(unsigned), nblocks*nblocks+10, backup_function);
+	starpu_mpi_checkpoint_template_add_entry(checkpoint_p, STARPU_VALUE, &k, sizeof(k), nblocks*nblocks+10, backup_function);
 	starpu_mpi_checkpoint_template_freeze(checkpoint_p);
 
 	for (k = 0; k < nblocks; k++)
@@ -200,6 +202,8 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
 	unsigned nn = size/nblocks;
 
+	starpu_mpi_checkpoint_template_add_entry(checkpoint_p, STARPU_VALUE, &n, sizeof(n), nblocks*nblocks+10, backup_function);
+	starpu_mpi_checkpoint_template_freeze(checkpoint_p);
 	/* Column */
 	for (n = 0; n<nblocks; n++)
 	{
@@ -240,7 +244,7 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
 						       0);
 			}
 		}
-
+		starpu_mpi_submit_checkpoint_template(*checkpoint_p);
 		starpu_iteration_pop();
 	}
 
@@ -248,7 +252,10 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
 	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
 	for (m = 0; m < nblocks; m++)
 		for (n = 0; n < nblocks ; n++)
-			starpu_data_wont_use(data_handles[m][n]);
+		{
+			if (starpu_data_get_home_node(data_handles[m][n])>=0)
+				starpu_data_wont_use(data_handles[m][n]);
+		}
 }
 
 /* TODO: generate from compiler polyhedral analysis of classical algorithm */
@@ -259,6 +266,9 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
 	unsigned nn = size/nblocks;
 
+
+	starpu_mpi_checkpoint_template_add_entry(checkpoint_p, STARPU_VALUE, &a, sizeof(a), nblocks*nblocks+10, backup_function);
+	starpu_mpi_checkpoint_template_freeze(checkpoint_p);
 	/* double-antidiagonal number:
 	 * - a=0 contains (0,0) plus (1,0)
 	 * - a=1 contains (2,0), (1,1) plus (3,0), (2, 1)
@@ -345,6 +355,7 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
 					       0);
 		}
 
+		starpu_mpi_submit_checkpoint_template(*checkpoint_p);
 		starpu_iteration_pop();
 	}
 
@@ -352,7 +363,10 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
 	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
 	for (m = 0; m < nblocks; m++)
 		for (n = 0; n < nblocks ; n++)
-			starpu_data_wont_use(data_handles[m][n]);
+		{
+			if (starpu_data_get_home_node(data_handles[m][n])>=0)
+				starpu_data_wont_use(data_handles[m][n]);
+		}
 }
 
 /* TODO: generate from compiler polyhedral analysis of classical algorithm */
@@ -370,6 +384,10 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
 	 * - a=1 contains (2,0), (1,1) plus (3,0), (2, 1)
 	 * - etc.
 	 */
+
+	starpu_mpi_checkpoint_template_add_entry(checkpoint_p, STARPU_VALUE, &a, sizeof(a), nblocks*nblocks+10, backup_function);
+	starpu_mpi_checkpoint_template_freeze(checkpoint_p);
+
 	for (a = 0; a < 4*nblocks; a++)
 	{
 		starpu_iteration_push(a);
@@ -425,6 +443,7 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
 
 		}
 
+		starpu_mpi_submit_checkpoint_template(*checkpoint_p);
 		starpu_iteration_pop();
 	}
 
@@ -432,7 +451,10 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
 	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
 	for (m = 0; m < nblocks; m++)
 		for (n = 0; n < nblocks ; n++)
-			starpu_data_wont_use(data_handles[m][n]);
+		{
+			if (starpu_data_get_home_node(data_handles[m][n])>=0)
+				starpu_data_wont_use(data_handles[m][n]);
+		}
 }
 
 /*