Explorar o código

Removed flush and modified bench

Andra Hugo hai 14 anos
pai
achega
c404cced9f

+ 0 - 4
examples/Makefile.am

@@ -437,12 +437,8 @@ examplebin_PROGRAMS += 				\
 cholesky_and_lu_cholesky_and_lu_SOURCES =			\
 	cholesky_and_lu/cholesky_and_lu.c			\
 	cholesky_and_lu/cholesky/cholesky_implicit.c		\
-	cholesky_and_lu/cholesky/cholesky_implicit_all_machine.c \
 	cholesky_and_lu/cholesky/cholesky_models.c		\
 	cholesky_and_lu/cholesky/cholesky_kernels.c		\
-	cholesky_and_lu/lu/slu_implicit.c			\
-	cholesky_and_lu/lu/slu_implicit_pivot.c			\
-	cholesky_and_lu/lu/slu_kernels.c			\
 	common/blas.c
 endif
 

+ 1 - 2
examples/cholesky_and_lu/cholesky/cholesky.h

@@ -72,8 +72,7 @@ void chol_cublas_codelet_update_u22(void *descr[], void *_args);
 #endif
 
 int run_cholesky_grain_tag(struct starpu_sched_ctx *sched_ctx, int argc, char **argv);
-double run_cholesky_implicit(struct starpu_sched_ctx *sched_ctx, int argc, char **argv);
-double run_cholesky_implicit_all_machine(int argc, char **argv);
+double run_cholesky_implicit(struct starpu_sched_ctx *sched_ctx, int argc, char **argv, double *timing);
 int run_cholesky_tag(struct starpu_sched_ctx *sched_ctx, int argc, char **argv);
 double run_cholesky_tile_tag(struct starpu_sched_ctx *sched_ctx, int argc, char **argv);
 

+ 50 - 26
examples/cholesky_and_lu/cholesky/cholesky_implicit.c

@@ -69,7 +69,7 @@ static void callback_turn_spmd_on(void *arg __attribute__ ((unused)))
 	cl22.type = STARPU_SPMD;
 }
 
-static double _cholesky(starpu_data_handle dataA, unsigned nblocks, struct starpu_sched_ctx *sched_ctx)
+static double _cholesky(starpu_data_handle dataA, unsigned nblocks, struct starpu_sched_ctx *sched_ctx, double *timing)
 {
 	struct timeval start;
 	struct timeval end;
@@ -85,21 +85,35 @@ static double _cholesky(starpu_data_handle dataA, unsigned nblocks, struct starp
 	{
                 starpu_data_handle sdatakk = starpu_data_get_sub_data(dataA, 2, k, k);
 
-                starpu_insert_task_to_ctx(sched_ctx, &cl11,
-                                   STARPU_PRIORITY, prio_level,
-                                   STARPU_RW, sdatakk,
-				   STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
-                                   0);
+		if(sched_ctx != NULL)
+			starpu_insert_task_to_ctx(sched_ctx, &cl11,
+						  STARPU_PRIORITY, prio_level,
+						  STARPU_RW, sdatakk,
+						  STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
+						  0);
+		else
+			starpu_insert_task(&cl11,
+					   STARPU_PRIORITY, prio_level,
+					   STARPU_RW, sdatakk,
+					   STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
+					   0);
 
 		for (j = k+1; j<nblocks; j++)
 		{
                         starpu_data_handle sdatakj = starpu_data_get_sub_data(dataA, 2, k, j);
 
-                        starpu_insert_task_to_ctx(sched_ctx, &cl21,
-                                           STARPU_PRIORITY, (j == k+1)?prio_level:STARPU_DEFAULT_PRIO,
-                                           STARPU_R, sdatakk,
-                                           STARPU_RW, sdatakj,
-                                           0);
+			if(sched_ctx != NULL)
+				starpu_insert_task_to_ctx(sched_ctx, &cl21,
+							  STARPU_PRIORITY, (j == k+1)?prio_level:STARPU_DEFAULT_PRIO,
+							  STARPU_R, sdatakk,
+							  STARPU_RW, sdatakj,
+							  0);
+			else
+				starpu_insert_task(&cl21,
+						   STARPU_PRIORITY, (j == k+1)?prio_level:STARPU_DEFAULT_PRIO,
+						   STARPU_R, sdatakk,
+						   STARPU_RW, sdatakj,
+						   0);
 
 			for (i = k+1; i<nblocks; i++)
 			{
@@ -107,34 +121,44 @@ static double _cholesky(starpu_data_handle dataA, unsigned nblocks, struct starp
                                 {
 					starpu_data_handle sdataki = starpu_data_get_sub_data(dataA, 2, k, i);
 					starpu_data_handle sdataij = starpu_data_get_sub_data(dataA, 2, i, j);
-					
-					starpu_insert_task_to_ctx(sched_ctx, &cl22,
-                                                           STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
-                                                           STARPU_R, sdataki,
-                                                           STARPU_R, sdatakj,
-                                                           STARPU_RW, sdataij,
-                                                           0);
+					if(sched_ctx != NULL)
+						starpu_insert_task_to_ctx(sched_ctx, &cl22,
+									  STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
+									  STARPU_R, sdataki,
+									  STARPU_R, sdatakj,
+									  STARPU_RW, sdataij,
+									  0);
+					else 
+						starpu_insert_task(&cl22,
+								   STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
+								   STARPU_R, sdataki,
+								   STARPU_R, sdatakj,
+								   STARPU_RW, sdataij,
+								   0);
                                 }
 			}
 		}
 	}
 
-	//		starpu_task_wait_for_all();
-	starpu_wait_for_all_tasks_of_sched_ctx(sched_ctx);
+	if(sched_ctx != NULL)
+		starpu_wait_for_all_tasks_of_sched_ctx(sched_ctx);
+	else
+		starpu_task_wait_for_all();
 
 	starpu_data_unpartition(dataA, 0);
 
 	gettimeofday(&end, NULL);
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	(*timing) = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+
 	unsigned long n = starpu_matrix_get_nx(dataA);
 
 	double flop = (1.0f*n*n*n)/3.0f;
 	
-	return (flop/timing/1000.0f);
+	return (flop/(*timing)/1000.0f);
 }
 
-static double cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks, struct starpu_sched_ctx *sched_ctx)
+static double cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks, struct starpu_sched_ctx *sched_ctx, double *timing)
 {
 	starpu_data_handle dataA;
 
@@ -156,10 +180,10 @@ static double cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks
 
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 
-	return _cholesky(dataA, nblocks, sched_ctx);
+	return _cholesky(dataA, nblocks, sched_ctx, timing);
 }
 
-double run_cholesky_implicit(struct starpu_sched_ctx *sched_ctx, int argc, char **argv)
+double run_cholesky_implicit(struct starpu_sched_ctx *sched_ctx, int argc, char **argv, double *timing)
 {
 	/* create a simple definite positive symetric matrix example
 	 *
@@ -204,7 +228,7 @@ double run_cholesky_implicit(struct starpu_sched_ctx *sched_ctx, int argc, char
 	}
 #endif
 
-	double gflops = cholesky(mat, size, size, nblocks, sched_ctx);
+	double gflops = cholesky(mat, size, size, nblocks, sched_ctx, timing);
 
 #ifdef PRINT_OUTPUT
 	printf("Results :\n");

+ 40 - 195
examples/cholesky_and_lu/cholesky_and_lu.c

@@ -1,5 +1,4 @@
 #include "cholesky/cholesky.h"
-#include "lu/lu_example_float.c"
 #include <pthread.h>
 
 typedef struct {
@@ -7,7 +6,12 @@ typedef struct {
   char **argv;
 } params;
 
-#define NSAMPLES 20
+typedef struct {
+  double flops;
+  double avg_timing;
+} retvals;
+
+#define NSAMPLES 10
 
 struct starpu_sched_ctx sched_ctx;
 struct starpu_sched_ctx sched_ctx2;
@@ -17,104 +21,67 @@ struct starpu_sched_ctx sched_ctx4;
 void* func_cholesky(void *val){
   params *p = (params*)val;
 
-  int procs[] = {1, 2, 3, 4, 5, 6};
+  int procs[] = {1, 3, 4, 5, 6, 7};
   starpu_create_sched_ctx(&sched_ctx, "heft", procs, 6, "cholesky1");
 
   int i;
-  double *flops = (double*)malloc(sizeof(double));
-  (*flops) = 0;
+  retvals *rv  = (retvals*)malloc(sizeof(retvals));
+  rv->flops = 0;
+  rv->avg_timing = 0;
+  double timing = 0;
   for(i = 0; i < NSAMPLES; i++)
     {
-      (*flops) += run_cholesky_implicit(&sched_ctx, p->argc, p->argv);
+      rv->flops += run_cholesky_implicit(&sched_ctx, p->argc, p->argv, &timing);
+      rv->avg_timing += timing;
     }
 
-  (*flops) /= NSAMPLES;
-  return (void*)flops;
+  rv->flops /= NSAMPLES;
+  rv->avg_timing /= NSAMPLES;
+  return (void*)rv;
 }
 
 void* func_cholesky2(void *val){
   params *p = (params*)val;
 
-  int procs[] = {0, 7, 8, 9, 10, 11};
+  int procs[] = {0, 8, 9, 10, 11, 12};
   starpu_create_sched_ctx(&sched_ctx2, "heft", procs, 6, "cholesky2");
 
   int i;
-  double *flops = (double*)malloc(sizeof(double));
-  (*flops) = 0;
+  retvals *rv  = (retvals*)malloc(sizeof(retvals));
+  rv->flops = 0;
+  rv->avg_timing = 0;
+  double timing = 0;
+
   for(i = 0; i < NSAMPLES; i++)
     {
-      (*flops) += run_cholesky_implicit(&sched_ctx2, p->argc, p->argv);
+      rv->flops += run_cholesky_implicit(&sched_ctx2, p->argc, p->argv, &timing);
+      rv->avg_timing += timing;
     }
 
-  (*flops) /= NSAMPLES;
-  return (void*)flops;
+  rv->flops /= NSAMPLES;
+  rv->avg_timing /= NSAMPLES;
+  return (void*)rv;
 }
 
 void* func_cholesky3(void *val){
   params *p = (params*)val;
 
   int i;
-  double *flops = (double*)malloc(sizeof(double));
-  (*flops) = 0;
-  for(i = 0; i < NSAMPLES; i++)
-    {
-      (*flops) += run_cholesky_implicit_all_machine(p->argc, p->argv);
-    }
-
-  (*flops) /= NSAMPLES;
-  return (void*)flops;
-}
-
-
-void* func_lu(void *val){
-  params *p = (params*)val;
+  retvals *rv  = (retvals*)malloc(sizeof(retvals));
+  rv->flops = 0;
+  rv->avg_timing = 0;
+  double timing = 0;
 
-  int procs2[] = {0, 7, 8, 9, 10, 11};
-  starpu_create_sched_ctx(&sched_ctx3, "heft", procs2, 6, "lu");
-
-  int i;
-  double *flops = (double*)malloc(sizeof(double));
-  (*flops) = 0;
   for(i = 0; i < NSAMPLES; i++)
     {
-      (*flops) += run_lu(&sched_ctx3, p->argc, p->argv);
+      rv->flops += run_cholesky_implicit(NULL, p->argc, p->argv, &timing);
+      rv->avg_timing += timing;
     }
 
-  (*flops) /= NSAMPLES;
-  return (void*)flops;
-}
-
-void* func_lu2(void *val){
-  params *p = (params*)val;
-
-  int procs2[] = {1, 2, 3, 4, 5, 6};
-  starpu_create_sched_ctx(&sched_ctx4, "heft", procs2, 6, "lu2");
-
-  int i;
-  double *flops = (double*)malloc(sizeof(double));
-  (*flops) = 0;
-  for(i = 0; i < NSAMPLES; i++)
-    {
-      (*flops) += run_lu(&sched_ctx4, p->argc, p->argv);
-    }
-
-  (*flops) /= NSAMPLES;
-  return (void*)flops;
-}
-
-void* func_lu3(void *val){
-  params *p = (params*)val;
-
-  int i;
-  double *flops = (double*)malloc(sizeof(double));
-  (*flops) = 0;
-  for(i = 0; i < NSAMPLES; i++)
-    {
-      (*flops) += run_lu(NULL, p->argc, p->argv);
-    }
+  rv->flops /= NSAMPLES;
+  rv->avg_timing /= NSAMPLES;
 
-  (*flops) /= NSAMPLES;
-  return (void*)flops;
+  return (void*)rv;
 }
 
 void cholesky_vs_cholesky(params *p){
@@ -164,7 +131,9 @@ void cholesky_vs_cholesky(params *p){
   starpu_helper_cublas_shutdown();
   starpu_shutdown();
 
-  printf("%2.2f %2.2f %2.2f %2.2f %2.2f\n", *((double*)gflops_cholesky1), *((double*)gflops_cholesky2), *((double*)gflops_cholesky3), *((double*)gflops_cholesky4), *((double*)gflops_cholesky5));
+  printf("%2.2f %2.2f %2.2f %2.2f %2.2f ", ((retvals*)gflops_cholesky1)->flops, ((retvals*)gflops_cholesky2)->flops, ((retvals*)gflops_cholesky3)->flops, ((retvals*)gflops_cholesky4)->flops, ((retvals*)gflops_cholesky5)->flops);
+
+  printf("%2.2f %2.2f %2.2f %2.2f %2.2f\n", ((retvals*)gflops_cholesky1)->avg_timing, ((retvals*)gflops_cholesky2)->avg_timing, ((retvals*)gflops_cholesky3)->avg_timing, ((retvals*)gflops_cholesky4)->avg_timing, ((retvals*)gflops_cholesky5)->avg_timing);
 
   free(gflops_cholesky1);
   free(gflops_cholesky2);
@@ -173,137 +142,13 @@ void cholesky_vs_cholesky(params *p){
   free(gflops_cholesky5);
 }
 
-void cholesky_vs_lu(params *p){
-  /* one cholesky and one lu each one in its own context */
-  starpu_init(NULL);
-  starpu_helper_cublas_init();
-
-  pthread_t tid[2];
-
-  pthread_create(&tid[0], NULL, (void*)func_cholesky, (void*)p);
-  pthread_create(&tid[1], NULL, (void*)func_lu, (void*)p);
-
-  void *gflops_cholesky;
-  void *gflops_lu;
- 
-  pthread_join(tid[0], &gflops_cholesky);
-  pthread_join(tid[1], &gflops_lu);
-
-  starpu_helper_cublas_shutdown();
-  starpu_shutdown();
-
-  /*one cholesky and one lu mixed in a single context*/
-  starpu_init(NULL);
-  starpu_helper_cublas_init();
-
-  pthread_t tid2[2];
-
-  pthread_create(&tid2[0], NULL, (void*)func_cholesky3, (void*)p);
-  pthread_create(&tid2[1], NULL, (void*)func_lu3, (void*)p);
-
-  void *gflops_cholesky2;
-  void *gflops_lu2;
- 
-  pthread_join(tid2[0], &gflops_cholesky2);
-  pthread_join(tid2[1], &gflops_lu2);
-
-  starpu_helper_cublas_shutdown();
-  starpu_shutdown();
-
-
-  /* 1 lu all alone on the whole machine */
-  starpu_init(NULL);
-  starpu_helper_cublas_init();
-
-  void *gflops_lu3 = func_lu3(p);
-
-  starpu_helper_cublas_shutdown();
-  starpu_shutdown();
-
-  /* 1 cholesky all alone on the whole machine */
-  starpu_init(NULL);
-  starpu_helper_cublas_init();
-
-  void *gflops_cholesky3 = func_cholesky3(p);
-
-  starpu_helper_cublas_shutdown();
-  starpu_shutdown();
-
-  printf("%2.2f %2.2f %2.2f %2.2f %2.2f %2.2f \n", *((double*)gflops_cholesky), *((double*)gflops_lu), *((double*)gflops_cholesky2), *((double*)gflops_lu2), *((double*)gflops_cholesky3), *((double*)gflops_lu3));
-
-  free(gflops_cholesky);
-  free(gflops_cholesky2);
-  free(gflops_cholesky3);
-  free(gflops_lu);
-  free(gflops_lu2);
-  free(gflops_lu3);
-}
-
-void lu_vs_lu(params *p){
-  /* 2 lu in different ctxs */
-  starpu_init(NULL);
-  starpu_helper_cublas_init();
-
-  pthread_t tid[2];
-
-  pthread_create(&tid[0], NULL, (void*)func_lu, (void*)p);
-  pthread_create(&tid[1], NULL, (void*)func_lu2, (void*)p);
-
-  void *gflops_lu1;
-  void *gflops_lu2;
- 
-  pthread_join(tid[0], &gflops_lu1);
-  pthread_join(tid[1], &gflops_lu2);
-
-  starpu_helper_cublas_shutdown();
-  starpu_shutdown();
-
-  /* 1 lu all alone on the whole machine */
-  starpu_init(NULL);
-  starpu_helper_cublas_init();
-
-  void *gflops_lu3 = func_lu3(p);
-
-  starpu_helper_cublas_shutdown();
-  starpu_shutdown();
-
-  //   printf("%2.2f\n", *((double*)gflops_lu3));
-  printf("%2.2f %2.2f %2.2f\n", *((double*)gflops_lu1), *((double*)gflops_lu2), *((double*)gflops_lu3));
-
-  /* /\* 2 lu in a single ctx *\/ */
-  /* starpu_init(NULL); */
-  /* starpu_helper_cublas_init(); */
-
-  /* pthread_t tid2[2]; */
-
-  /* pthread_create(&tid2[0], NULL, (void*)func_lu3, (void*)p); */
-  /* pthread_create(&tid2[1], NULL, (void*)func_lu3, (void*)p); */
-
-  /* void *gflops_lu4; */
-  /* void *gflops_lu5; */
- 
-  /* pthread_join(tid2[0], &gflops_lu4); */
-  /* pthread_join(tid2[1], &gflops_lu5); */
-
-  /* starpu_helper_cublas_shutdown(); */
-  /* starpu_shutdown(); */
-
-  /* printf("%2.2f %2.2f %2.2f %2.2f %2.2f\n", *((double*)gflops_lu1), *((double*)gflops_lu2), *((double*)gflops_lu3), *((double*)gflops_lu4), *((double*)gflops_lu5)); */
-
-  /* free(gflops_lu1); */
-  /* free(gflops_lu2); */
-  /* free(gflops_lu3); */
-  /* free(gflops_lu4); */
-  /* free(gflops_lu5); */
-}
-
 int main(int argc, char **argv)
 {
   params p;
   p.argc = argc;
   p.argv = argv;
 
-  lu_vs_lu(&p);
+  cholesky_vs_cholesky(&p);
 
   return 0;
 }

+ 1 - 1
examples/cholesky_and_lu/lu/xlu_kernels.c

@@ -61,7 +61,7 @@ static inline void STARPU_LU(common_u22)(void *descr[],
 
 			if (STARPU_UNLIKELY((cures = cudaThreadSynchronize()) != cudaSuccess))
 				STARPU_CUDA_REPORT_ERROR(cures);
-			printf("dx = %d dy = %d dz = %d ld21 = %d ld12= %d ld22 = %d\n");
+			//			printf("dx = %d dy = %d dz = %d ld21 = %d ld12= %d ld22 = %d\n");
 
 
 			CUBLAS_GEMM('n', 'n', dx, dy, dz,

+ 30 - 35
src/core/sched_ctx.c

@@ -157,16 +157,12 @@ static int set_changing_ctx_flag(starpu_worker_status changing_ctx, int nworkeri
 void starpu_create_sched_ctx(struct starpu_sched_ctx *sched_ctx, const char *policy_name, int
 			     *workerids_in_ctx, int nworkerids_in_ctx, const char *sched_name)
 {
-	  /* wait for the workers concerned by the change of contex                              
-	   * to finish their work in the previous context */
-	if(!starpu_wait_for_all_tasks_of_workers(workerids_in_ctx, nworkerids_in_ctx))
-	  {
-		/* block the workers until the contex is switched */
-		set_changing_ctx_flag(STATUS_CHANGING_CTX, nworkerids_in_ctx, workerids_in_ctx);
-		_starpu_create_sched_ctx(sched_ctx, policy_name, workerids_in_ctx, nworkerids_in_ctx, 0, sched_name);
-		/* also wait the workers to wake up before using the context */
-		set_changing_ctx_flag(STATUS_UNKNOWN, nworkerids_in_ctx, workerids_in_ctx);
-	  }
+	/* block the workers until the contex is switched */
+	set_changing_ctx_flag(STATUS_CHANGING_CTX, nworkerids_in_ctx, workerids_in_ctx);
+	_starpu_create_sched_ctx(sched_ctx, policy_name, workerids_in_ctx, nworkerids_in_ctx, 0, sched_name);
+	/* also wait the workers to wake up before using the context */
+	set_changing_ctx_flag(STATUS_UNKNOWN, nworkerids_in_ctx, workerids_in_ctx);
+
 	return;
 }
 
@@ -202,20 +198,24 @@ static void _starpu_remove_sched_ctx_from_worker(struct starpu_worker_s *workera
 
 void starpu_delete_sched_ctx(struct starpu_sched_ctx *sched_ctx)
 {
-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
-	int nworkers = config->topology.nworkers;
-
-	int i;
-	for(i = 0; i < nworkers; i++)
+  if(!starpu_wait_for_all_tasks_of_sched_ctx(sched_ctx))
 	  {
-		struct starpu_worker_s *workerarg = _starpu_get_worker_struct(i);
-		_starpu_remove_sched_ctx_from_worker(workerarg, sched_ctx);
-	  }
-	
-	free(sched_ctx->sched_policy);
-	sched_ctx->sched_policy = NULL;
 
-	return;
+		int nworkers = sched_ctx->nworkers_in_ctx;
+		int workerid;
+
+		int i;
+		for(i = 0; i < nworkers; i++)
+		  {
+			workerid = sched_ctx->workerid[i];
+			struct starpu_worker_s *workerarg = _starpu_get_worker_struct(workerid);
+			_starpu_remove_sched_ctx_from_worker(workerarg, sched_ctx);
+		  }
+	
+		free(sched_ctx->sched_policy);
+		sched_ctx->sched_policy = NULL;
+	  }		
+	return;	
 }
 
 void _starpu_delete_all_sched_ctxs()
@@ -358,18 +358,13 @@ static void _starpu_add_workers_to_sched_ctx(int *workerids_in_ctx, int nworkeri
 void starpu_add_workers_to_sched_ctx(int *workerids_in_ctx, int nworkerids_in_ctx,
 				     struct starpu_sched_ctx *sched_ctx)
 {
-  	  /* wait for the workers concerned by the change of contex                              
-	   * to finish their work in the previous context */
-	if(!starpu_wait_for_all_tasks_of_workers(workerids_in_ctx, nworkerids_in_ctx))
-	  {
-		/* block the workers until the contex is switched */
-		set_changing_ctx_flag(STATUS_CHANGING_CTX, nworkerids_in_ctx, workerids_in_ctx);
-		_starpu_add_workers_to_sched_ctx(workerids_in_ctx, nworkerids_in_ctx, sched_ctx);
-		/* also wait the workers to wake up before using the context */
-		set_changing_ctx_flag(STATUS_UNKNOWN, nworkerids_in_ctx, workerids_in_ctx);
-	  }
-	return;
+	/* block the workers until the contex is switched */
+	set_changing_ctx_flag(STATUS_CHANGING_CTX, nworkerids_in_ctx, workerids_in_ctx);
+	_starpu_add_workers_to_sched_ctx(workerids_in_ctx, nworkerids_in_ctx, sched_ctx);
+	/* also wait the workers to wake up before using the context */
+	set_changing_ctx_flag(STATUS_UNKNOWN, nworkerids_in_ctx, workerids_in_ctx);
 
+	return;
 }
 
 static int _starpu_get_first_free_space(int *workerids, int old_nworkerids_in_ctx)
@@ -424,7 +419,7 @@ static void _starpu_remove_workers_from_sched_ctx(int *workerids_in_ctx, int nwo
 			workerid = sched_ctx->workerid[i];
 			struct starpu_worker_s *workerarg = _starpu_get_worker_struct(workerid);
 			_starpu_remove_sched_ctx_from_worker(workerarg, sched_ctx);
-sched_ctx->workerid[i] = -1;
+			sched_ctx->workerid[i] = -1;
 		  }
 
 		sched_ctx->nworkers_in_ctx = 0;
@@ -458,7 +453,7 @@ sched_ctx->workerid[i] = -1;
 void starpu_remove_workers_from_sched_ctx(int *workerids_in_ctx, int nworkerids_in_ctx, 
 					  struct starpu_sched_ctx *sched_ctx)
 {
-	  /* wait for the workers concerned by the change of contex                              
+	  /* wait for the workers concerned by the change of contex                       
 	   * to finish their work in the previous context */
 	if(!starpu_wait_for_all_tasks_of_workers(workerids_in_ctx, nworkerids_in_ctx))
 	  {

+ 1 - 4
src/sched_policies/heft.c

@@ -161,7 +161,6 @@ static void compute_all_performance_predictions(struct starpu_task *task,
   for (worker_in_ctx = 0; worker_in_ctx < nworkers; worker_in_ctx++)
     {
       worker = sched_ctx->workerid[worker_in_ctx];
-      //      printf("%s: compute perf for %d\n", sched_ctx->sched_name, worker);
       /* Sometimes workers didn't take the tasks as early as we expected */
       exp_start[worker] = STARPU_MAX(exp_start[worker], starpu_timing_now());
       exp_end[worker] = exp_start[worker] + exp_len[worker];
@@ -170,7 +169,6 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
       if (!starpu_worker_may_execute_task(worker, task))
 	{
-	  //	  printf("worker %d may not execute task\n", worker);
 	  /* no one on that queue may execute this task */
 	  continue;
 	}
@@ -191,7 +189,7 @@ static void compute_all_performance_predictions(struct starpu_task *task,
       }
 
       double ntasks_end = ntasks[worker] / starpu_worker_get_relative_speedup(perf_arch);
-      //      printf("%d: model %d local_task_len = %2.2f local_data_pen = %2.2f local_power = %2.2f \n", worker, task->cl->model->type, local_task_length[worker], local_data_penalty[worker], local_power[worker]);
+
       if (ntasks_best == -1
 	  || (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
 	  || (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
@@ -228,7 +226,6 @@ static void compute_all_performance_predictions(struct starpu_task *task,
     }
 
   *forced_best = unknown?ntasks_best:-1;
-  //  printf("******************* the winner is %d\n", *forced_best);
   *best_exp_endp = best_exp_end;
   *max_exp_endp = max_exp_end;
 }

+ 1 - 1
tests/cholesky_and_lu/sched.sh

@@ -36,7 +36,7 @@ do
 
     echo "$ROOTDIR/examples/$BENCH_NAME/$BENCH_NAME $OPTIONS"
 
-    val=`$ROOTDIR/examples/$BENCH_NAME/$BENCH_NAME $OPTIONS`
+    val=`STARPU_NCUDA=2 $ROOTDIR/examples/$BENCH_NAME/$BENCH_NAME $OPTIONS`
 
     echo "$size $val"
     echo "$size $val" >> $filename