14 年之前 · c404cced9f
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -437,12 +437,8 @@ examplebin_PROGRAMS += 				\
 
				 cholesky_and_lu_cholesky_and_lu_SOURCES =			\
			
 
				 	cholesky_and_lu/cholesky_and_lu.c			\
			
 
				 	cholesky_and_lu/cholesky/cholesky_implicit.c		\
			
 
				-	cholesky_and_lu/cholesky/cholesky_implicit_all_machine.c \
			
 
				 	cholesky_and_lu/cholesky/cholesky_models.c		\
			
 
				 	cholesky_and_lu/cholesky/cholesky_kernels.c		\
			
 
				-	cholesky_and_lu/lu/slu_implicit.c			\
			
 
				-	cholesky_and_lu/lu/slu_implicit_pivot.c			\
			
 
				-	cholesky_and_lu/lu/slu_kernels.c			\
			
 
				 	common/blas.c
			
 
				 endif
			
 
				 
			
--- a/examples/cholesky_and_lu/cholesky/cholesky.h
+++ b/examples/cholesky_and_lu/cholesky/cholesky.h
@@ -72,8 +72,7 @@ void chol_cublas_codelet_update_u22(void *descr[], void *_args);
 
				 #endif
			
 
				 
			
 
				 int run_cholesky_grain_tag(struct starpu_sched_ctx *sched_ctx, int argc, char **argv);
			
 
				-double run_cholesky_implicit(struct starpu_sched_ctx *sched_ctx, int argc, char **argv);
			
 
				-double run_cholesky_implicit_all_machine(int argc, char **argv);
			
 
				+double run_cholesky_implicit(struct starpu_sched_ctx *sched_ctx, int argc, char **argv, double *timing);
			
 
				 int run_cholesky_tag(struct starpu_sched_ctx *sched_ctx, int argc, char **argv);
			
 
				 double run_cholesky_tile_tag(struct starpu_sched_ctx *sched_ctx, int argc, char **argv);
			
 
				 
			
--- a/examples/cholesky_and_lu/cholesky/cholesky_implicit.c
+++ b/examples/cholesky_and_lu/cholesky/cholesky_implicit.c
@@ -69,7 +69,7 @@ static void callback_turn_spmd_on(void *arg __attribute__ ((unused)))
 
				 	cl22.type = STARPU_SPMD;
			
 
				 }
			
 
				 
			
 
				-static double _cholesky(starpu_data_handle dataA, unsigned nblocks, struct starpu_sched_ctx *sched_ctx)
			
 
				+static double _cholesky(starpu_data_handle dataA, unsigned nblocks, struct starpu_sched_ctx *sched_ctx, double *timing)
			
 
				 {
			
 
				 	struct timeval start;
			
 
				 	struct timeval end;
			
@@ -85,21 +85,35 @@ static double _cholesky(starpu_data_handle dataA, unsigned nblocks, struct starp
 
				 	{
			
 
				                 starpu_data_handle sdatakk = starpu_data_get_sub_data(dataA, 2, k, k);
			
 
				 
			
 
				-                starpu_insert_task_to_ctx(sched_ctx, &cl11,
			
 
				-                                   STARPU_PRIORITY, prio_level,
			
 
				-                                   STARPU_RW, sdatakk,
			
 
				-				   STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
			
 
				-                                   0);
			
 
				+		if(sched_ctx != NULL)
			
 
				+			starpu_insert_task_to_ctx(sched_ctx, &cl11,
			
 
				+						  STARPU_PRIORITY, prio_level,
			
 
				+						  STARPU_RW, sdatakk,
			
 
				+						  STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
			
 
				+						  0);
			
 
				+		else
			
 
				+			starpu_insert_task(&cl11,
			
 
				+					   STARPU_PRIORITY, prio_level,
			
 
				+					   STARPU_RW, sdatakk,
			
 
				+					   STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
			
 
				+					   0);
			
 
				 
			
 
				 		for (j = k+1; j<nblocks; j++)
			
 
				 		{
			
 
				                         starpu_data_handle sdatakj = starpu_data_get_sub_data(dataA, 2, k, j);
			
 
				 
			
 
				-                        starpu_insert_task_to_ctx(sched_ctx, &cl21,
			
 
				-                                           STARPU_PRIORITY, (j == k+1)?prio_level:STARPU_DEFAULT_PRIO,
			
 
				-                                           STARPU_R, sdatakk,
			
 
				-                                           STARPU_RW, sdatakj,
			
 
				-                                           0);
			
 
				+			if(sched_ctx != NULL)
			
 
				+				starpu_insert_task_to_ctx(sched_ctx, &cl21,
			
 
				+							  STARPU_PRIORITY, (j == k+1)?prio_level:STARPU_DEFAULT_PRIO,
			
 
				+							  STARPU_R, sdatakk,
			
 
				+							  STARPU_RW, sdatakj,
			
 
				+							  0);
			
 
				+			else
			
 
				+				starpu_insert_task(&cl21,
			
 
				+						   STARPU_PRIORITY, (j == k+1)?prio_level:STARPU_DEFAULT_PRIO,
			
 
				+						   STARPU_R, sdatakk,
			
 
				+						   STARPU_RW, sdatakj,
			
 
				+						   0);
			
 
				 
			
 
				 			for (i = k+1; i<nblocks; i++)
			
 
				 			{
			
@@ -107,34 +121,44 @@ static double _cholesky(starpu_data_handle dataA, unsigned nblocks, struct starp
 
				                                 {
			
 
				 					starpu_data_handle sdataki = starpu_data_get_sub_data(dataA, 2, k, i);
			
 
				 					starpu_data_handle sdataij = starpu_data_get_sub_data(dataA, 2, i, j);
			
 
				-					
			
 
				-					starpu_insert_task_to_ctx(sched_ctx, &cl22,
			
 
				-                                                           STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
			
 
				-                                                           STARPU_R, sdataki,
			
 
				-                                                           STARPU_R, sdatakj,
			
 
				-                                                           STARPU_RW, sdataij,
			
 
				-                                                           0);
			
 
				+					if(sched_ctx != NULL)
			
 
				+						starpu_insert_task_to_ctx(sched_ctx, &cl22,
			
 
				+									  STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
			
 
				+									  STARPU_R, sdataki,
			
 
				+									  STARPU_R, sdatakj,
			
 
				+									  STARPU_RW, sdataij,
			
 
				+									  0);
			
 
				+					else 
			
 
				+						starpu_insert_task(&cl22,
			
 
				+								   STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
			
 
				+								   STARPU_R, sdataki,
			
 
				+								   STARPU_R, sdatakj,
			
 
				+								   STARPU_RW, sdataij,
			
 
				+								   0);
			
 
				                                 }
			
 
				 			}
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	//		starpu_task_wait_for_all();
			
 
				-	starpu_wait_for_all_tasks_of_sched_ctx(sched_ctx);
			
 
				+	if(sched_ctx != NULL)
			
 
				+		starpu_wait_for_all_tasks_of_sched_ctx(sched_ctx);
			
 
				+	else
			
 
				+		starpu_task_wait_for_all();
			
 
				 
			
 
				 	starpu_data_unpartition(dataA, 0);
			
 
				 
			
 
				 	gettimeofday(&end, NULL);
			
 
				 
			
 
				-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				+	(*timing) = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				+
			
 
				 	unsigned long n = starpu_matrix_get_nx(dataA);
			
 
				 
			
 
				 	double flop = (1.0f*n*n*n)/3.0f;
			
 
				 	
			
 
				-	return (flop/timing/1000.0f);
			
 
				+	return (flop/(*timing)/1000.0f);
			
 
				 }
			
 
				 
			
 
				-static double cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks, struct starpu_sched_ctx *sched_ctx)
			
 
				+static double cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks, struct starpu_sched_ctx *sched_ctx, double *timing)
			
 
				 {
			
 
				 	starpu_data_handle dataA;
			
 
				 
			
@@ -156,10 +180,10 @@ static double cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks
 
				 
			
 
				 	starpu_data_map_filters(dataA, 2, &f, &f2);
			
 
				 
			
 
				-	return _cholesky(dataA, nblocks, sched_ctx);
			
 
				+	return _cholesky(dataA, nblocks, sched_ctx, timing);
			
 
				 }
			
 
				 
			
 
				-double run_cholesky_implicit(struct starpu_sched_ctx *sched_ctx, int argc, char **argv)
			
 
				+double run_cholesky_implicit(struct starpu_sched_ctx *sched_ctx, int argc, char **argv, double *timing)
			
 
				 {
			
 
				 	/* create a simple definite positive symetric matrix example
			
 
				 	 *
			
@@ -204,7 +228,7 @@ double run_cholesky_implicit(struct starpu_sched_ctx *sched_ctx, int argc, char
 
				 	}
			
 
				 #endif
			
 
				 
			
 
				-	double gflops = cholesky(mat, size, size, nblocks, sched_ctx);
			
 
				+	double gflops = cholesky(mat, size, size, nblocks, sched_ctx, timing);
			
 
				 
			
 
				 #ifdef PRINT_OUTPUT
			
 
				 	printf("Results :\n");
			
--- a/examples/cholesky_and_lu/cholesky_and_lu.c
+++ b/examples/cholesky_and_lu/cholesky_and_lu.c
@@ -1,5 +1,4 @@
 
				 #include "cholesky/cholesky.h"
			
 
				-#include "lu/lu_example_float.c"
			
 
				 #include <pthread.h>
			
 
				 
			
 
				 typedef struct {
			
@@ -7,7 +6,12 @@ typedef struct {
 
				   char **argv;
			
 
				 } params;
			
 
				 
			
 
				-#define NSAMPLES 20
			
 
				+typedef struct {
			
 
				+  double flops;
			
 
				+  double avg_timing;
			
 
				+} retvals;
			
 
				+
			
 
				+#define NSAMPLES 10
			
 
				 
			
 
				 struct starpu_sched_ctx sched_ctx;
			
 
				 struct starpu_sched_ctx sched_ctx2;
			
@@ -17,104 +21,67 @@ struct starpu_sched_ctx sched_ctx4;
 
				 void* func_cholesky(void *val){
			
 
				   params *p = (params*)val;
			
 
				 
			
 
				-  int procs[] = {1, 2, 3, 4, 5, 6};
			
 
				+  int procs[] = {1, 3, 4, 5, 6, 7};
			
 
				   starpu_create_sched_ctx(&sched_ctx, "heft", procs, 6, "cholesky1");
			
 
				 
			
 
				   int i;
			
 
				-  double *flops = (double*)malloc(sizeof(double));
			
 
				-  (*flops) = 0;
			
 
				+  retvals *rv  = (retvals*)malloc(sizeof(retvals));
			
 
				+  rv->flops = 0;
			
 
				+  rv->avg_timing = 0;
			
 
				+  double timing = 0;
			
 
				   for(i = 0; i < NSAMPLES; i++)
			
 
				     {
			
 
				-      (*flops) += run_cholesky_implicit(&sched_ctx, p->argc, p->argv);
			
 
				+      rv->flops += run_cholesky_implicit(&sched_ctx, p->argc, p->argv, &timing);
			
 
				+      rv->avg_timing += timing;
			
 
				     }
			
 
				 
			
 
				-  (*flops) /= NSAMPLES;
			
 
				-  return (void*)flops;
			
 
				+  rv->flops /= NSAMPLES;
			
 
				+  rv->avg_timing /= NSAMPLES;
			
 
				+  return (void*)rv;
			
 
				 }
			
 
				 
			
 
				 void* func_cholesky2(void *val){
			
 
				   params *p = (params*)val;
			
 
				 
			
 
				-  int procs[] = {0, 7, 8, 9, 10, 11};
			
 
				+  int procs[] = {0, 8, 9, 10, 11, 12};
			
 
				   starpu_create_sched_ctx(&sched_ctx2, "heft", procs, 6, "cholesky2");
			
 
				 
			
 
				   int i;
			
 
				-  double *flops = (double*)malloc(sizeof(double));
			
 
				-  (*flops) = 0;
			
 
				+  retvals *rv  = (retvals*)malloc(sizeof(retvals));
			
 
				+  rv->flops = 0;
			
 
				+  rv->avg_timing = 0;
			
 
				+  double timing = 0;
			
 
				+
			
 
				   for(i = 0; i < NSAMPLES; i++)
			
 
				     {
			
 
				-      (*flops) += run_cholesky_implicit(&sched_ctx2, p->argc, p->argv);
			
 
				+      rv->flops += run_cholesky_implicit(&sched_ctx2, p->argc, p->argv, &timing);
			
 
				+      rv->avg_timing += timing;
			
 
				     }
			
 
				 
			
 
				-  (*flops) /= NSAMPLES;
			
 
				-  return (void*)flops;
			
 
				+  rv->flops /= NSAMPLES;
			
 
				+  rv->avg_timing /= NSAMPLES;
			
 
				+  return (void*)rv;
			
 
				 }
			
 
				 
			
 
				 void* func_cholesky3(void *val){
			
 
				   params *p = (params*)val;
			
 
				 
			
 
				   int i;
			
 
				-  double *flops = (double*)malloc(sizeof(double));
			
 
				-  (*flops) = 0;
			
 
				-  for(i = 0; i < NSAMPLES; i++)
			
 
				-    {
			
 
				-      (*flops) += run_cholesky_implicit_all_machine(p->argc, p->argv);
			
 
				-    }
			
 
				-
			
 
				-  (*flops) /= NSAMPLES;
			
 
				-  return (void*)flops;
			
 
				-}
			
 
				-
			
 
				-
			
 
				-void* func_lu(void *val){
			
 
				-  params *p = (params*)val;
			
 
				+  retvals *rv  = (retvals*)malloc(sizeof(retvals));
			
 
				+  rv->flops = 0;
			
 
				+  rv->avg_timing = 0;
			
 
				+  double timing = 0;
			
 
				 
			
 
				-  int procs2[] = {0, 7, 8, 9, 10, 11};
			
 
				-  starpu_create_sched_ctx(&sched_ctx3, "heft", procs2, 6, "lu");
			
 
				-
			
 
				-  int i;
			
 
				-  double *flops = (double*)malloc(sizeof(double));
			
 
				-  (*flops) = 0;
			
 
				   for(i = 0; i < NSAMPLES; i++)
			
 
				     {
			
 
				-      (*flops) += run_lu(&sched_ctx3, p->argc, p->argv);
			
 
				+      rv->flops += run_cholesky_implicit(NULL, p->argc, p->argv, &timing);
			
 
				+      rv->avg_timing += timing;
			
 
				     }
			
 
				 
			
 
				-  (*flops) /= NSAMPLES;
			
 
				-  return (void*)flops;
			
 
				-}
			
 
				-
			
 
				-void* func_lu2(void *val){
			
 
				-  params *p = (params*)val;
			
 
				-
			
 
				-  int procs2[] = {1, 2, 3, 4, 5, 6};
			
 
				-  starpu_create_sched_ctx(&sched_ctx4, "heft", procs2, 6, "lu2");
			
 
				-
			
 
				-  int i;
			
 
				-  double *flops = (double*)malloc(sizeof(double));
			
 
				-  (*flops) = 0;
			
 
				-  for(i = 0; i < NSAMPLES; i++)
			
 
				-    {
			
 
				-      (*flops) += run_lu(&sched_ctx4, p->argc, p->argv);
			
 
				-    }
			
 
				-
			
 
				-  (*flops) /= NSAMPLES;
			
 
				-  return (void*)flops;
			
 
				-}
			
 
				-
			
 
				-void* func_lu3(void *val){
			
 
				-  params *p = (params*)val;
			
 
				-
			
 
				-  int i;
			
 
				-  double *flops = (double*)malloc(sizeof(double));
			
 
				-  (*flops) = 0;
			
 
				-  for(i = 0; i < NSAMPLES; i++)
			
 
				-    {
			
 
				-      (*flops) += run_lu(NULL, p->argc, p->argv);
			
 
				-    }
			
 
				+  rv->flops /= NSAMPLES;
			
 
				+  rv->avg_timing /= NSAMPLES;
			
 
				 
			
 
				-  (*flops) /= NSAMPLES;
			
 
				-  return (void*)flops;
			
 
				+  return (void*)rv;
			
 
				 }
			
 
				 
			
 
				 void cholesky_vs_cholesky(params *p){
			
@@ -164,7 +131,9 @@ void cholesky_vs_cholesky(params *p){
 
				   starpu_helper_cublas_shutdown();
			
 
				   starpu_shutdown();
			
 
				 
			
 
				-  printf("%2.2f %2.2f %2.2f %2.2f %2.2f\n", *((double*)gflops_cholesky1), *((double*)gflops_cholesky2), *((double*)gflops_cholesky3), *((double*)gflops_cholesky4), *((double*)gflops_cholesky5));
			
 
				+  printf("%2.2f %2.2f %2.2f %2.2f %2.2f ", ((retvals*)gflops_cholesky1)->flops, ((retvals*)gflops_cholesky2)->flops, ((retvals*)gflops_cholesky3)->flops, ((retvals*)gflops_cholesky4)->flops, ((retvals*)gflops_cholesky5)->flops);
			
 
				+
			
 
				+  printf("%2.2f %2.2f %2.2f %2.2f %2.2f\n", ((retvals*)gflops_cholesky1)->avg_timing, ((retvals*)gflops_cholesky2)->avg_timing, ((retvals*)gflops_cholesky3)->avg_timing, ((retvals*)gflops_cholesky4)->avg_timing, ((retvals*)gflops_cholesky5)->avg_timing);
			
 
				 
			
 
				   free(gflops_cholesky1);
			
 
				   free(gflops_cholesky2);
			
@@ -173,137 +142,13 @@ void cholesky_vs_cholesky(params *p){
 
				   free(gflops_cholesky5);
			
 
				 }
			
 
				 
			
 
				-void cholesky_vs_lu(params *p){
			
 
				-  /* one cholesky and one lu each one in its own context */
			
 
				-  starpu_init(NULL);
			
 
				-  starpu_helper_cublas_init();
			
 
				-
			
 
				-  pthread_t tid[2];
			
 
				-
			
 
				-  pthread_create(&tid[0], NULL, (void*)func_cholesky, (void*)p);
			
 
				-  pthread_create(&tid[1], NULL, (void*)func_lu, (void*)p);
			
 
				-
			
 
				-  void *gflops_cholesky;
			
 
				-  void *gflops_lu;
			
 
				- 
			
 
				-  pthread_join(tid[0], &gflops_cholesky);
			
 
				-  pthread_join(tid[1], &gflops_lu);
			
 
				-
			
 
				-  starpu_helper_cublas_shutdown();
			
 
				-  starpu_shutdown();
			
 
				-
			
 
				-  /*one cholesky and one lu mixed in a single context*/
			
 
				-  starpu_init(NULL);
			
 
				-  starpu_helper_cublas_init();
			
 
				-
			
 
				-  pthread_t tid2[2];
			
 
				-
			
 
				-  pthread_create(&tid2[0], NULL, (void*)func_cholesky3, (void*)p);
			
 
				-  pthread_create(&tid2[1], NULL, (void*)func_lu3, (void*)p);
			
 
				-
			
 
				-  void *gflops_cholesky2;
			
 
				-  void *gflops_lu2;
			
 
				- 
			
 
				-  pthread_join(tid2[0], &gflops_cholesky2);
			
 
				-  pthread_join(tid2[1], &gflops_lu2);
			
 
				-
			
 
				-  starpu_helper_cublas_shutdown();
			
 
				-  starpu_shutdown();
			
 
				-
			
 
				-
			
 
				-  /* 1 lu all alone on the whole machine */
			
 
				-  starpu_init(NULL);
			
 
				-  starpu_helper_cublas_init();
			
 
				-
			
 
				-  void *gflops_lu3 = func_lu3(p);
			
 
				-
			
 
				-  starpu_helper_cublas_shutdown();
			
 
				-  starpu_shutdown();
			
 
				-
			
 
				-  /* 1 cholesky all alone on the whole machine */
			
 
				-  starpu_init(NULL);
			
 
				-  starpu_helper_cublas_init();
			
 
				-
			
 
				-  void *gflops_cholesky3 = func_cholesky3(p);
			
 
				-
			
 
				-  starpu_helper_cublas_shutdown();
			
 
				-  starpu_shutdown();
			
 
				-
			
 
				-  printf("%2.2f %2.2f %2.2f %2.2f %2.2f %2.2f \n", *((double*)gflops_cholesky), *((double*)gflops_lu), *((double*)gflops_cholesky2), *((double*)gflops_lu2), *((double*)gflops_cholesky3), *((double*)gflops_lu3));
			
 
				-
			
 
				-  free(gflops_cholesky);
			
 
				-  free(gflops_cholesky2);
			
 
				-  free(gflops_cholesky3);
			
 
				-  free(gflops_lu);
			
 
				-  free(gflops_lu2);
			
 
				-  free(gflops_lu3);
			
 
				-}
			
 
				-
			
 
				-void lu_vs_lu(params *p){
			
 
				-  /* 2 lu in different ctxs */
			
 
				-  starpu_init(NULL);
			
 
				-  starpu_helper_cublas_init();
			
 
				-
			
 
				-  pthread_t tid[2];
			
 
				-
			
 
				-  pthread_create(&tid[0], NULL, (void*)func_lu, (void*)p);
			
 
				-  pthread_create(&tid[1], NULL, (void*)func_lu2, (void*)p);
			
 
				-
			
 
				-  void *gflops_lu1;
			
 
				-  void *gflops_lu2;
			
 
				- 
			
 
				-  pthread_join(tid[0], &gflops_lu1);
			
 
				-  pthread_join(tid[1], &gflops_lu2);
			
 
				-
			
 
				-  starpu_helper_cublas_shutdown();
			
 
				-  starpu_shutdown();
			
 
				-
			
 
				-  /* 1 lu all alone on the whole machine */
			
 
				-  starpu_init(NULL);
			
 
				-  starpu_helper_cublas_init();
			
 
				-
			
 
				-  void *gflops_lu3 = func_lu3(p);
			
 
				-
			
 
				-  starpu_helper_cublas_shutdown();
			
 
				-  starpu_shutdown();
			
 
				-
			
 
				-  //   printf("%2.2f\n", *((double*)gflops_lu3));
			
 
				-  printf("%2.2f %2.2f %2.2f\n", *((double*)gflops_lu1), *((double*)gflops_lu2), *((double*)gflops_lu3));
			
 
				-
			
 
				-  /* /\* 2 lu in a single ctx *\/ */
			
 
				-  /* starpu_init(NULL); */
			
 
				-  /* starpu_helper_cublas_init(); */
			
 
				-
			
 
				-  /* pthread_t tid2[2]; */
			
 
				-
			
 
				-  /* pthread_create(&tid2[0], NULL, (void*)func_lu3, (void*)p); */
			
 
				-  /* pthread_create(&tid2[1], NULL, (void*)func_lu3, (void*)p); */
			
 
				-
			
 
				-  /* void *gflops_lu4; */
			
 
				-  /* void *gflops_lu5; */
			
 
				- 
			
 
				-  /* pthread_join(tid2[0], &gflops_lu4); */
			
 
				-  /* pthread_join(tid2[1], &gflops_lu5); */
			
 
				-
			
 
				-  /* starpu_helper_cublas_shutdown(); */
			
 
				-  /* starpu_shutdown(); */
			
 
				-
			
 
				-  /* printf("%2.2f %2.2f %2.2f %2.2f %2.2f\n", *((double*)gflops_lu1), *((double*)gflops_lu2), *((double*)gflops_lu3), *((double*)gflops_lu4), *((double*)gflops_lu5)); */
			
 
				-
			
 
				-  /* free(gflops_lu1); */
			
 
				-  /* free(gflops_lu2); */
			
 
				-  /* free(gflops_lu3); */
			
 
				-  /* free(gflops_lu4); */
			
 
				-  /* free(gflops_lu5); */
			
 
				-}
			
 
				-
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				   params p;
			
 
				   p.argc = argc;
			
 
				   p.argv = argv;
			
 
				 
			
 
				-  lu_vs_lu(&p);
			
 
				+  cholesky_vs_cholesky(&p);
			
 
				 
			
 
				   return 0;
			
 
				 }
			
--- a/examples/cholesky_and_lu/lu/xlu_kernels.c
+++ b/examples/cholesky_and_lu/lu/xlu_kernels.c
@@ -61,7 +61,7 @@ static inline void STARPU_LU(common_u22)(void *descr[],
 
				 
			
 
				 			if (STARPU_UNLIKELY((cures = cudaThreadSynchronize()) != cudaSuccess))
			
 
				 				STARPU_CUDA_REPORT_ERROR(cures);
			
 
				-			printf("dx = %d dy = %d dz = %d ld21 = %d ld12= %d ld22 = %d\n");
			
 
				+			//			printf("dx = %d dy = %d dz = %d ld21 = %d ld12= %d ld22 = %d\n");
			
 
				 
			
 
				 
			
 
				 			CUBLAS_GEMM('n', 'n', dx, dy, dz,
			
--- a/src/core/sched_ctx.c
+++ b/src/core/sched_ctx.c
@@ -157,16 +157,12 @@ static int set_changing_ctx_flag(starpu_worker_status changing_ctx, int nworkeri
 
				 void starpu_create_sched_ctx(struct starpu_sched_ctx *sched_ctx, const char *policy_name, int
			
 
				 			     *workerids_in_ctx, int nworkerids_in_ctx, const char *sched_name)
			
 
				 {
			
 
				-	  /* wait for the workers concerned by the change of contex                              
			
 
				-	   * to finish their work in the previous context */
			
 
				-	if(!starpu_wait_for_all_tasks_of_workers(workerids_in_ctx, nworkerids_in_ctx))
			
 
				-	  {
			
 
				-		/* block the workers until the contex is switched */
			
 
				-		set_changing_ctx_flag(STATUS_CHANGING_CTX, nworkerids_in_ctx, workerids_in_ctx);
			
 
				-		_starpu_create_sched_ctx(sched_ctx, policy_name, workerids_in_ctx, nworkerids_in_ctx, 0, sched_name);
			
 
				-		/* also wait the workers to wake up before using the context */
			
 
				-		set_changing_ctx_flag(STATUS_UNKNOWN, nworkerids_in_ctx, workerids_in_ctx);
			
 
				-	  }
			
 
				+	/* block the workers until the contex is switched */
			
 
				+	set_changing_ctx_flag(STATUS_CHANGING_CTX, nworkerids_in_ctx, workerids_in_ctx);
			
 
				+	_starpu_create_sched_ctx(sched_ctx, policy_name, workerids_in_ctx, nworkerids_in_ctx, 0, sched_name);
			
 
				+	/* also wait the workers to wake up before using the context */
			
 
				+	set_changing_ctx_flag(STATUS_UNKNOWN, nworkerids_in_ctx, workerids_in_ctx);
			
 
				+
			
 
				 	return;
			
 
				 }
			
 
				 
			
@@ -202,20 +198,24 @@ static void _starpu_remove_sched_ctx_from_worker(struct starpu_worker_s *workera
 
				 
			
 
				 void starpu_delete_sched_ctx(struct starpu_sched_ctx *sched_ctx)
			
 
				 {
			
 
				-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
			
 
				-	int nworkers = config->topology.nworkers;
			
 
				-
			
 
				-	int i;
			
 
				-	for(i = 0; i < nworkers; i++)
			
 
				+  if(!starpu_wait_for_all_tasks_of_sched_ctx(sched_ctx))
			
 
				 	  {
			
 
				-		struct starpu_worker_s *workerarg = _starpu_get_worker_struct(i);
			
 
				-		_starpu_remove_sched_ctx_from_worker(workerarg, sched_ctx);
			
 
				-	  }
			
 
				-	
			
 
				-	free(sched_ctx->sched_policy);
			
 
				-	sched_ctx->sched_policy = NULL;
			
 
				 
			
 
				-	return;
			
 
				+		int nworkers = sched_ctx->nworkers_in_ctx;
			
 
				+		int workerid;
			
 
				+
			
 
				+		int i;
			
 
				+		for(i = 0; i < nworkers; i++)
			
 
				+		  {
			
 
				+			workerid = sched_ctx->workerid[i];
			
 
				+			struct starpu_worker_s *workerarg = _starpu_get_worker_struct(workerid);
			
 
				+			_starpu_remove_sched_ctx_from_worker(workerarg, sched_ctx);
			
 
				+		  }
			
 
				+	
			
 
				+		free(sched_ctx->sched_policy);
			
 
				+		sched_ctx->sched_policy = NULL;
			
 
				+	  }		
			
 
				+	return;	
			
 
				 }
			
 
				 
			
 
				 void _starpu_delete_all_sched_ctxs()
			
@@ -358,18 +358,13 @@ static void _starpu_add_workers_to_sched_ctx(int *workerids_in_ctx, int nworkeri
 
				 void starpu_add_workers_to_sched_ctx(int *workerids_in_ctx, int nworkerids_in_ctx,
			
 
				 				     struct starpu_sched_ctx *sched_ctx)
			
 
				 {
			
 
				-  	  /* wait for the workers concerned by the change of contex                              
			
 
				-	   * to finish their work in the previous context */
			
 
				-	if(!starpu_wait_for_all_tasks_of_workers(workerids_in_ctx, nworkerids_in_ctx))
			
 
				-	  {
			
 
				-		/* block the workers until the contex is switched */
			
 
				-		set_changing_ctx_flag(STATUS_CHANGING_CTX, nworkerids_in_ctx, workerids_in_ctx);
			
 
				-		_starpu_add_workers_to_sched_ctx(workerids_in_ctx, nworkerids_in_ctx, sched_ctx);
			
 
				-		/* also wait the workers to wake up before using the context */
			
 
				-		set_changing_ctx_flag(STATUS_UNKNOWN, nworkerids_in_ctx, workerids_in_ctx);
			
 
				-	  }
			
 
				-	return;
			
 
				+	/* block the workers until the contex is switched */
			
 
				+	set_changing_ctx_flag(STATUS_CHANGING_CTX, nworkerids_in_ctx, workerids_in_ctx);
			
 
				+	_starpu_add_workers_to_sched_ctx(workerids_in_ctx, nworkerids_in_ctx, sched_ctx);
			
 
				+	/* also wait the workers to wake up before using the context */
			
 
				+	set_changing_ctx_flag(STATUS_UNKNOWN, nworkerids_in_ctx, workerids_in_ctx);
			
 
				 
			
 
				+	return;
			
 
				 }
			
 
				 
			
 
				 static int _starpu_get_first_free_space(int *workerids, int old_nworkerids_in_ctx)
			
@@ -424,7 +419,7 @@ static void _starpu_remove_workers_from_sched_ctx(int *workerids_in_ctx, int nwo
 
				 			workerid = sched_ctx->workerid[i];
			
 
				 			struct starpu_worker_s *workerarg = _starpu_get_worker_struct(workerid);
			
 
				 			_starpu_remove_sched_ctx_from_worker(workerarg, sched_ctx);
			
 
				-sched_ctx->workerid[i] = -1;
			
 
				+			sched_ctx->workerid[i] = -1;
			
 
				 		  }
			
 
				 
			
 
				 		sched_ctx->nworkers_in_ctx = 0;
			
@@ -458,7 +453,7 @@ sched_ctx->workerid[i] = -1;
 
				 void starpu_remove_workers_from_sched_ctx(int *workerids_in_ctx, int nworkerids_in_ctx, 
			
 
				 					  struct starpu_sched_ctx *sched_ctx)
			
 
				 {
			
 
				-	  /* wait for the workers concerned by the change of contex                              
			
 
				+	  /* wait for the workers concerned by the change of contex                       
			
 
				 	   * to finish their work in the previous context */
			
 
				 	if(!starpu_wait_for_all_tasks_of_workers(workerids_in_ctx, nworkerids_in_ctx))
			
 
				 	  {
			
--- a/src/sched_policies/heft.c
+++ b/src/sched_policies/heft.c
@@ -161,7 +161,6 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
				   for (worker_in_ctx = 0; worker_in_ctx < nworkers; worker_in_ctx++)
			
 
				     {
			
 
				       worker = sched_ctx->workerid[worker_in_ctx];
			
 
				-      //      printf("%s: compute perf for %d\n", sched_ctx->sched_name, worker);
			
 
				       /* Sometimes workers didn't take the tasks as early as we expected */
			
 
				       exp_start[worker] = STARPU_MAX(exp_start[worker], starpu_timing_now());
			
 
				       exp_end[worker] = exp_start[worker] + exp_len[worker];
			
@@ -170,7 +169,6 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
				 
			
 
				       if (!starpu_worker_may_execute_task(worker, task))
			
 
				 	{
			
 
				-	  //	  printf("worker %d may not execute task\n", worker);
			
 
				 	  /* no one on that queue may execute this task */
			
 
				 	  continue;
			
 
				 	}
			
@@ -191,7 +189,7 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
				       }
			
 
				 
			
 
				       double ntasks_end = ntasks[worker] / starpu_worker_get_relative_speedup(perf_arch);
			
 
				-      //      printf("%d: model %d local_task_len = %2.2f local_data_pen = %2.2f local_power = %2.2f \n", worker, task->cl->model->type, local_task_length[worker], local_data_penalty[worker], local_power[worker]);
			
 
				+
			
 
				       if (ntasks_best == -1
			
 
				 	  || (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
			
 
				 	  || (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
			
@@ -228,7 +226,6 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
				     }
			
 
				 
			
 
				   *forced_best = unknown?ntasks_best:-1;
			
 
				-  //  printf("******************* the winner is %d\n", *forced_best);
			
 
				   *best_exp_endp = best_exp_end;
			
 
				   *max_exp_endp = max_exp_end;
			
 
				 }
			
--- a/tests/cholesky_and_lu/sched.sh
+++ b/tests/cholesky_and_lu/sched.sh
@@ -36,7 +36,7 @@ do
 
				 
			
 
				     echo "$ROOTDIR/examples/$BENCH_NAME/$BENCH_NAME $OPTIONS"
			
 
				 
			
 
				-    val=`$ROOTDIR/examples/$BENCH_NAME/$BENCH_NAME $OPTIONS`
			
 
				+    val=`STARPU_NCUDA=2 $ROOTDIR/examples/$BENCH_NAME/$BENCH_NAME $OPTIONS`
			
 
				 
			
 
				     echo "$size $val"
			
 
				     echo "$size $val" >> $filename