14 years ago · ef5bbf9eb0
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -55,6 +55,7 @@ EXTRA_DIST = 					\
 
				 	lu/xlu_implicit_pivot.c			\
			
 
				 	lu/xlu_kernels.c			\
			
 
				 	lu/lu_example.c				\
			
 
				+	sched_ctx_utils/sched_ctx_utils.c		\
			
 
				 	cholesky_2ctxs/cholesky_2ctxs.c			\
			
 
				 	cholesky_no_ctxs/cholesky_no_ctxs.c			\
			
 
				 	incrementer/incrementer_kernels_opencl_kernel.cl 	\
			
@@ -118,6 +119,7 @@ noinst_HEADERS = 				\
 
				 	lu/complex_double.h			\
			
 
				 	lu/blas_complex.h			\
			
 
				 	cholesky/cholesky.h			\
			
 
				+	sched_ctx_utils/sched_ctx_utils.h	\
			
 
				 	common/blas_model.h			\
			
 
				 	common/blas.h				\
			
 
				 	mult/simple.h				\
			
@@ -435,6 +437,7 @@ cholesky_cholesky_implicit_SOURCES =		\
 
				 	cholesky/cholesky_implicit.c		\
			
 
				 	cholesky/cholesky_models.c		\
			
 
				 	cholesky/cholesky_kernels.c		\
			
 
				+	sched_ctx_utils/sched_ctx_utils.c	\
			
 
				 	common/blas.c
			
 
				 
			
 
				 cholesky_cholesky_implicit_LDADD =		\
			
--- a/examples/cholesky/cholesky.h
+++ b/examples/cholesky/cholesky.h
@@ -61,6 +61,8 @@ static unsigned nbigblocks = 8;
 
				 static unsigned pinned = 0;
			
 
				 static unsigned noprio = 0;
			
 
				 static unsigned check = 0;
			
 
				+static unsigned with_ctxs = 0;
			
 
				+static unsigned with_noctxs = 0;
			
 
				 
			
 
				 void chol_cpu_codelet_update_u11(void **, void *);
			
 
				 void chol_cpu_codelet_update_u21(void **, void *);
			
@@ -80,37 +82,46 @@ static void __attribute__((unused)) parse_args(int argc, char **argv)
 
				 {
			
 
				 	int i;
			
 
				 	for (i = 1; i < argc; i++) {
			
 
				+		if (strcmp(argv[i], "-with_ctxs") == 0) {
			
 
				+			with_ctxs = 1;
			
 
				+			break;
			
 
				+		}
			
 
				+		if (strcmp(argv[i], "-with_noctxs") == 0) {
			
 
				+			with_noctxs = 1;
			
 
				+			break;
			
 
				+		}
			
 
				+
			
 
				 		if (strcmp(argv[i], "-size") == 0) {
			
 
				-		        char *argptr;
			
 
				+			char *argptr;
			
 
				 			size = strtol(argv[++i], &argptr, 10);
			
 
				 		}
			
 
				-
			
 
				+		
			
 
				 		if (strcmp(argv[i], "-nblocks") == 0) {
			
 
				-		        char *argptr;
			
 
				+			char *argptr;
			
 
				 			nblocks = strtol(argv[++i], &argptr, 10);
			
 
				 		}
			
 
				-
			
 
				+		
			
 
				 		if (strcmp(argv[i], "-nbigblocks") == 0) {
			
 
				-		        char *argptr;
			
 
				+			char *argptr;
			
 
				 			nbigblocks = strtol(argv[++i], &argptr, 10);
			
 
				 		}
			
 
				-
			
 
				+		
			
 
				 		if (strcmp(argv[i], "-pin") == 0) {
			
 
				 			pinned = 1;
			
 
				 		}
			
 
				-
			
 
				+		
			
 
				 		if (strcmp(argv[i], "-no-prio") == 0) {
			
 
				 			noprio = 1;
			
 
				 		}
			
 
				-
			
 
				+		
			
 
				 		if (strcmp(argv[i], "-check") == 0) {
			
 
				 			check = 1;
			
 
				 		}
			
 
				-
			
 
				+		
			
 
				 		if (strcmp(argv[i], "-h") == 0) {
			
 
				 			printf("usage : %s [-pin] [-size size] [-nblocks nblocks] [-check]\n", argv[0]);
			
 
				-		}
			
 
				-	}
			
 
				+		}	
			
 
				+	}	
			
 
				 }
			
 
				 
			
 
				 #endif /* __DW_CHOLESKY_H__ */
			
--- a/examples/cholesky/cholesky_implicit.c
+++ b/examples/cholesky/cholesky_implicit.c
@@ -17,7 +17,7 @@
 
				  */
			
 
				 
			
 
				 #include "cholesky.h"
			
 
				-
			
 
				+#include "../sched_ctx_utils/sched_ctx_utils.h"
			
 
				 /*
			
 
				  *	Create the codelets
			
 
				  */
			
@@ -126,13 +126,20 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 
				 	gettimeofday(&end, NULL);
			
 
				 
			
 
				 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				-	FPRINTF(stderr, "Computation took (in ms)\n");
			
 
				-	FPRINTF(stdout, "%2.2f\n", timing/1000);
			
 
				 
			
 
				 	unsigned long n = starpu_matrix_get_nx(dataA);
			
 
				 
			
 
				 	double flop = (1.0f*n*n*n)/3.0f;
			
 
				-	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
			
 
				+
			
 
				+	if(with_ctxs || with_noctxs)
			
 
				+		update_sched_ctx_timing_results((flop/timing/1000.0f), (timing/1000000.0f));
			
 
				+	else
			
 
				+	{
			
 
				+		FPRINTF(stderr, "Computation took (in ms)\n");
			
 
				+		FPRINTF(stdout, "%2.2f\n", timing/1000);
			
 
				+	
			
 
				+		FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				 static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
			
@@ -158,19 +165,8 @@ static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 
				 	_cholesky(dataA, nblocks);
			
 
				 }
			
 
				 
			
 
				-int main(int argc, char **argv)
			
 
				+static void execute_cholesky(unsigned size, unsigned nblocks)
			
 
				 {
			
 
				-	/* create a simple definite positive symetric matrix example
			
 
				-	 *
			
 
				-	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
			
 
				-	 * */
			
 
				-
			
 
				-	parse_args(argc, argv);
			
 
				-
			
 
				-	starpu_init(NULL);
			
 
				-
			
 
				-	starpu_helper_cublas_init();
			
 
				-
			
 
				 	float *mat;
			
 
				 	starpu_malloc((void **)&mat, (size_t)size*size*sizeof(float));
			
 
				 
			
@@ -274,6 +270,34 @@ int main(int argc, char **argv)
 
				 	        }
			
 
				 	}
			
 
				 
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+	/* create a simple definite positive symetric matrix example
			
 
				+	 *
			
 
				+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
			
 
				+	 * */
			
 
				+
			
 
				+	parse_args(argc, argv);
			
 
				+
			
 
				+	if(with_ctxs || with_noctxs)
			
 
				+		parse_args_ctx(argc, argv);
			
 
				+
			
 
				+	starpu_init(NULL);
			
 
				+
			
 
				+	starpu_helper_cublas_init();
			
 
				+
			
 
				+	if(with_ctxs)
			
 
				+	{
			
 
				+		construct_contexts(execute_cholesky);
			
 
				+		start_2benchs(execute_cholesky);
			
 
				+	}
			
 
				+	else if(with_noctxs)
			
 
				+		start_2benchs(execute_cholesky);
			
 
				+	else
			
 
				+		execute_cholesky(size, nblocks);
			
 
				+
			
 
				 	starpu_helper_cublas_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
--- a/examples/cholesky_2ctxs/cholesky/cholesky.h
+++ b/examples/cholesky_2ctxs/cholesky/cholesky.h
@@ -71,7 +71,7 @@ void chol_cublas_codelet_update_u21(void *descr[], void *_args);
 
				 void chol_cublas_codelet_update_u22(void *descr[], void *_args);
			
 
				 #endif
			
 
				 
			
 
				-double run_cholesky_implicit(unsigned sched_ctx, int start, int argc, char **argv, double *timing, pthread_barrier_t *barrier);
			
 
				+double run_cholesky_implicit(int start, int argc, char **argv, double *timing, pthread_barrier_t *barrier);
			
 
				 
			
 
				 extern struct starpu_perfmodel_t chol_model_11;
			
 
				 extern struct starpu_perfmodel_t chol_model_21;
			
--- a/examples/cholesky_2ctxs/cholesky/cholesky_implicit.c
+++ b/examples/cholesky_2ctxs/cholesky/cholesky_implicit.c
@@ -70,7 +70,7 @@ static void callback_turn_spmd_on(void *arg __attribute__ ((unused)))
 
				 	cl22.type = STARPU_SPMD;
			
 
				 }
			
 
				 
			
 
				-static double _cholesky(starpu_data_handle dataA, unsigned nblocks, unsigned sched_ctx, double *timing)
			
 
				+static double _cholesky(starpu_data_handle dataA, unsigned nblocks, double *timing)
			
 
				 {
			
 
				 	struct timeval start;
			
 
				 	struct timeval end;
			
@@ -137,7 +137,7 @@ static double _cholesky(starpu_data_handle dataA, unsigned nblocks, unsigned sch
 
				 	return gflops;
			
 
				 }
			
 
				 
			
 
				-static double cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned sched_ctx, double *timing)
			
 
				+static double cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks, double *timing)
			
 
				 {
			
 
				 	starpu_data_handle dataA;
			
 
				 
			
@@ -156,12 +156,12 @@ static double cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks
 
				 	};
			
 
				 
			
 
				 	starpu_data_map_filters(dataA, 2, &f, &f2);
			
 
				-	double gflops = _cholesky(dataA, nblocks, sched_ctx, timing);
			
 
				+	double gflops = _cholesky(dataA, nblocks, timing);
			
 
				 	starpu_data_unregister(dataA);
			
 
				 	return gflops;
			
 
				 }
			
 
				 
			
 
				-double run_cholesky_implicit(unsigned sched_ctx, int start, int argc, char **argv, double *timing, pthread_barrier_t *barrier)
			
 
				+double run_cholesky_implicit(int start, int argc, char **argv, double *timing, pthread_barrier_t *barrier)
			
 
				 {
			
 
				 	/* create a simple definite positive symetric matrix example
			
 
				 	 *
			
@@ -208,8 +208,7 @@ double run_cholesky_implicit(unsigned sched_ctx, int start, int argc, char **arg
 
				 		printf("\n");
			
 
				 	}
			
 
				 #endif
			
 
				-//	starpu_set_sched_ctx(&sched_ctx);
			
 
				-	double gflops = cholesky(mat, size, size, nblocks, sched_ctx, timing);
			
 
				+	double gflops = cholesky(mat, size, size, nblocks, timing);
			
 
				 
			
 
				 #ifdef PRINT_OUTPUT
			
 
				 	printf("Results :\n");
			
--- a/examples/cholesky_2ctxs/cholesky_2ctxs.c
+++ b/examples/cholesky_2ctxs/cholesky_2ctxs.c
@@ -52,7 +52,7 @@ void* func_cholesky(void *val){
 
				   starpu_set_sched_ctx(sched_ctx);
			
 
				   for(i = 0; i < NSAMPLES; i++)
			
 
				     {
			
 
				-      rv->flops += run_cholesky_implicit(*sched_ctx, p->start, p->argc, p->argv, &timing, &barrier);
			
 
				+      rv->flops += run_cholesky_implicit(p->start, p->argc, p->argv, &timing, &barrier);
			
 
				       rv->avg_timing += timing;
			
 
				 
			
 
				     }
			
--- a/examples/sched_ctx_utils/sched_ctx_utils.c
+++ b/examples/sched_ctx_utils/sched_ctx_utils.c
@@ -0,0 +1,229 @@
 
				+#include "sched_ctx_utils.h"
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+unsigned size1;
			
 
				+unsigned size2;
			
 
				+unsigned nblocks1;
			
 
				+unsigned nblocks2;
			
 
				+unsigned cpu1;
			
 
				+unsigned cpu2;
			
 
				+unsigned gpu;
			
 
				+unsigned gpu1;
			
 
				+unsigned gpu2;
			
 
				+
			
 
				+typedef struct {
			
 
				+	unsigned id;
			
 
				+	unsigned ctx;
			
 
				+	int the_other_ctx;
			
 
				+	int *procs;
			
 
				+	int nprocs;
			
 
				+	void (*bench)(unsigned, unsigned);
			
 
				+	unsigned size;
			
 
				+	unsigned nblocks;
			
 
				+} params;
			
 
				+
			
 
				+typedef struct {
			
 
				+	double flops;
			
 
				+	double avg_timing;
			
 
				+} retvals;
			
 
				+
			
 
				+#define NSAMPLES 3
			
 
				+int first = 1;
			
 
				+pthread_mutex_t mut;
			
 
				+retvals rv[2];
			
 
				+params p1, p2;
			
 
				+
			
 
				+pthread_key_t key;
			
 
				+
			
 
				+/* Reset every benchmark parameter and both result slots to their defaults,
			
 
				+ * and create the thread-specific key that start_bench() uses to tell
			
 
				+ * update_sched_ctx_timing_results() which result slot a thread owns. */
			
 
				+void init()
			
 
				+{
			
 
				+	size1 = 4*1024;
			
 
				+	size2 = 4*1024;
			
 
				+	nblocks1 = 16;
			
 
				+	nblocks2 = 16;
			
 
				+	cpu1 = 0;
			
 
				+	cpu2 = 0;
			
 
				+	gpu = 0;
			
 
				+	gpu1 = 0;
			
 
				+	gpu2 = 0;
			
 
				+
			
 
				+	/* Both slots must start from zero: results are accumulated with += */
			
 
				+	rv[0].flops = 0.0;
			
 
				+	rv[1].flops = 0.0;
			
 
				+	rv[0].avg_timing = 0.0;
			
 
				+	rv[1].avg_timing = 0.0;
			
 
				+
			
 
				+	/* ctx == 0 means "no dedicated context" for start_bench() */
			
 
				+	p1.ctx = 0;
			
 
				+	p2.ctx = 0;
			
 
				+
			
 
				+	/* The ids double as indices into rv[] */
			
 
				+	p1.id = 0;
			
 
				+	p2.id = 1;
			
 
				+	pthread_key_create(&key, NULL);
			
 
				+}
			
 
				+
			
 
				+/* Add one sample (flops, avg_timing) to the result slot of the calling
			
 
				+ * thread.  The slot index is the id that start_bench() stored under `key`;
			
 
				+ * a thread that never registered an id is ignored. */
			
 
				+void update_sched_ctx_timing_results(double flops, double avg_timing)
			
 
				+{
			
 
				+	unsigned *id = pthread_getspecific(key);
			
 
				+
			
 
				+	/* pthread_getspecific() yields NULL when nothing was set for this thread */
			
 
				+	if (id == NULL)
			
 
				+		return;
			
 
				+
			
 
				+	rv[*id].flops += flops;
			
 
				+	rv[*id].avg_timing += avg_timing;
			
 
				+}
			
 
				+
			
 
				+/* Thread entry point.  `val` points to a `params`: runs p->bench NSAMPLES
			
 
				+ * times, then turns the sums accumulated in rv[p->id] into averages.
			
 
				+ * Always returns NULL. */
			
 
				+void* start_bench(void *val){
			
 
				+	params *p = (params*)val;
			
 
				+	int i;
			
 
				+
			
 
				+	/* Lets update_sched_ctx_timing_results() find this thread's slot */
			
 
				+	pthread_setspecific(key, &p->id);
			
 
				+
			
 
				+	if(p->ctx != 0)
			
 
				+		starpu_set_sched_ctx(&p->ctx);
			
 
				+
			
 
				+	for(i = 0; i < NSAMPLES; i++)
			
 
				+		p->bench(p->size, p->nblocks);
			
 
				+
			
 
				+	if(p->ctx != 0)
			
 
				+	{
			
 
				+		pthread_mutex_lock(&mut);
			
 
				+		/* Only the first thread to get here deletes its context */
			
 
				+		if(first){
			
 
				+			starpu_delete_sched_ctx(p->ctx, p->the_other_ctx);
			
 
				+		}
			
 
				+
			
 
				+		first = 0;
			
 
				+		pthread_mutex_unlock(&mut);
			
 
				+	}
			
 
				+
			
 
				+	rv[p->id].flops /= NSAMPLES;
			
 
				+	rv[p->id].avg_timing /= NSAMPLES;
			
 
				+
			
 
				+	/* Declared void *: return a defined value instead of falling off the end */
			
 
				+	return NULL;
			
 
				+}
			
 
				+
			
 
				+/* Run `bench` concurrently in two threads, one with (size1, nblocks1) and
			
 
				+ * one with (size2, nblocks2), wait for both, then print on one line:
			
 
				+ * the flops result of each run, the average timing of each run, and the
			
 
				+ * total wall-clock time in seconds. */
			
 
				+void start_2benchs(void (*bench)(unsigned, unsigned))
			
 
				+{
			
 
				+	p1.bench = bench;
			
 
				+	p1.size = size1;
			
 
				+	p1.nblocks = nblocks1;
			
 
				+
			
 
				+	p2.bench = bench;
			
 
				+	p2.size = size2;
			
 
				+	p2.nblocks = nblocks2;
			
 
				+
			
 
				+	pthread_t tid[2];
			
 
				+	pthread_mutex_init(&mut, NULL);
			
 
				+
			
 
				+	struct timeval start;
			
 
				+	struct timeval end;
			
 
				+
			
 
				+	gettimeofday(&start, NULL);
			
 
				+
			
 
				+	/* start_bench already has the type pthread_create() expects */
			
 
				+	pthread_create(&tid[0], NULL, start_bench, (void*)&p1);
			
 
				+	pthread_create(&tid[1], NULL, start_bench, (void*)&p2);
			
 
				+
			
 
				+	pthread_join(tid[0], NULL);
			
 
				+	pthread_join(tid[1], NULL);
			
 
				+
			
 
				+	gettimeofday(&end, NULL);
			
 
				+
			
 
				+	pthread_mutex_destroy(&mut);
			
 
				+
			
 
				+	/* Elapsed time in microseconds, then converted to seconds */
			
 
				+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				+	timing /= 1000000;
			
 
				+
			
 
				+	/* rv[] has exactly two slots: index 0 for p1, index 1 for p2 */
			
 
				+	printf("%2.2f %2.2f ", rv[0].flops, rv[1].flops);
			
 
				+	printf("%2.2f %2.2f %2.2f\n", rv[0].avg_timing, rv[1].avg_timing, timing);
			
 
				+
			
 
				+}
			
 
				+/* Create the two scheduling contexts used by start_2benchs().
			
 
				+ * Worker ids are taken in this order: `gpu` shared GPUs, `gpu1` GPUs of
			
 
				+ * context 1, `gpu2` GPUs of context 2, `cpu1` CPUs of context 1, `cpu2`
			
 
				+ * CPUs of context 2.  Context 1 ("heft") gets the shared GPUs plus its own
			
 
				+ * GPUs and CPUs; context 2 ("prio") likewise.  `bench` is not used.
			
 
				+ * The id arrays are heap-allocated and stay reachable through p1.procs and
			
 
				+ * p2.procs, so those pointers remain valid after this function returns;
			
 
				+ * they are never freed here. */
			
 
				+void construct_contexts(void (*bench)(unsigned, unsigned))
			
 
				+{
			
 
				+	(void)bench;
			
 
				+
			
 
				+	int nprocs1 = cpu1 + gpu + gpu1;
			
 
				+	int nprocs2 = cpu2 + gpu + gpu2;
			
 
				+	unsigned n_all_gpus = gpu + gpu1 + gpu2;
			
 
				+
			
 
				+	/* Always request at least one element: the counts default to 0 */
			
 
				+	int *procs = malloc((nprocs1 > 0 ? nprocs1 : 1) * sizeof *procs);
			
 
				+	int *procs2 = malloc((nprocs2 > 0 ? nprocs2 : 1) * sizeof *procs2);
			
 
				+	if (procs == NULL || procs2 == NULL)
			
 
				+	{
			
 
				+		fprintf(stderr, "construct_contexts: out of memory\n");
			
 
				+		exit(EXIT_FAILURE);
			
 
				+	}
			
 
				+
			
 
				+	int i;
			
 
				+	int k = 0;
			
 
				+
			
 
				+	/* Context 1: shared GPUs, its own GPUs, then its own CPUs */
			
 
				+	for(i = 0; i < gpu; i++)
			
 
				+		procs[k++] = i;
			
 
				+
			
 
				+	for(i = gpu; i < gpu + gpu1; i++)
			
 
				+		procs[k++] = i;
			
 
				+
			
 
				+	for(i = n_all_gpus; i < n_all_gpus + cpu1; i++)
			
 
				+		procs[k++] = i;
			
 
				+
			
 
				+	p1.ctx = starpu_create_sched_ctx("heft", procs, nprocs1, "sched_ctx1");
			
 
				+	p2.the_other_ctx = (int)p1.ctx;
			
 
				+	p1.procs = procs;
			
 
				+	p1.nprocs = nprocs1;
			
 
				+
			
 
				+	k = 0;
			
 
				+
			
 
				+	/* Context 2: shared GPUs, its own GPUs, then its own CPUs */
			
 
				+	for(i = 0; i < gpu; i++)
			
 
				+		procs2[k++] = i;
			
 
				+
			
 
				+	for(i = gpu + gpu1; i < gpu + gpu1 + gpu2; i++)
			
 
				+		procs2[k++] = i;
			
 
				+
			
 
				+	for(i = n_all_gpus  + cpu1; i < n_all_gpus + cpu1 + cpu2; i++)
			
 
				+		procs2[k++] = i;
			
 
				+
			
 
				+	p2.ctx = starpu_create_sched_ctx("prio", procs2, nprocs2, "sched_ctx2");
			
 
				+	p1.the_other_ctx = (int)p2.ctx;
			
 
				+	p2.procs = procs2;
			
 
				+	p2.nprocs = nprocs2;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+/* Reset all parameters with init(), then parse the two-benchmark options:
			
 
				+ * -size1, -nblocks1, -size2, -nblocks2, -cpu1, -cpu2, -gpu, -gpu1, -gpu2,
			
 
				+ * each followed by a decimal value.  Other arguments are ignored.
			
 
				+ * An option given as the very last argument has no value to read: it is
			
 
				+ * reported on stderr and parsing stops. */
			
 
				+void parse_args_ctx(int argc, char **argv)
			
 
				+{
			
 
				+	/* Option name -> variable that receives its value */
			
 
				+	const struct {
			
 
				+		const char *name;
			
 
				+		unsigned *value;
			
 
				+	} options[] = {
			
 
				+		{ "-size1", &size1 },
			
 
				+		{ "-nblocks1", &nblocks1 },
			
 
				+		{ "-size2", &size2 },
			
 
				+		{ "-nblocks2", &nblocks2 },
			
 
				+		{ "-cpu1", &cpu1 },
			
 
				+		{ "-cpu2", &cpu2 },
			
 
				+		{ "-gpu", &gpu },
			
 
				+		{ "-gpu1", &gpu1 },
			
 
				+		{ "-gpu2", &gpu2 },
			
 
				+	};
			
 
				+	const unsigned noptions = sizeof(options) / sizeof(options[0]);
			
 
				+	unsigned o;
			
 
				+	int i;
			
 
				+
			
 
				+	init();
			
 
				+
			
 
				+	for (i = 1; i < argc; i++) {
			
 
				+		for (o = 0; o < noptions; o++) {
			
 
				+			if (strcmp(argv[i], options[o].name) != 0)
			
 
				+				continue;
			
 
				+
			
 
				+			/* Never read argv[argc] (NULL) as a value */
			
 
				+			if (i + 1 >= argc) {
			
 
				+				fprintf(stderr, "%s: missing value\n", argv[i]);
			
 
				+				return;
			
 
				+			}
			
 
				+
			
 
				+			*options[o].value = strtol(argv[++i], NULL, 10);
			
 
				+			break;
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
--- a/examples/sched_ctx_utils/sched_ctx_utils.h
+++ b/examples/sched_ctx_utils/sched_ctx_utils.h
@@ -0,0 +1,10 @@
 
				+#include <limits.h>
			
 
				+#include <string.h>
			
 
				+#include <math.h>
			
 
				+#include <sys/time.h>
			
 
				+#include <stdlib.h>
			
 
				+
			
 
				+void parse_args_ctx(int argc, char **argv);
			
 
				+void update_sched_ctx_timing_results(double gflops, double timing);
			
 
				+void construct_contexts(void (*bench)(unsigned size, unsigned nblocks));
			
 
				+void start_2benchs(void (*bench)(unsigned size, unsigned nblocks));
			
--- a/src/core/sched_ctx.c
+++ b/src/core/sched_ctx.c
@@ -320,8 +320,7 @@ void starpu_add_workers_to_sched_ctx(int *workerids, int nworkers,
 
				 	return;
			
 
				 }
			
 
				 
			
 
				-static void _starpu_remove_workers_from_sched_ctx(int *workerids, int nworkers_to_remove, 
			
 
				-					  struct starpu_sched_ctx *sched_ctx)
			
 
				+static void _starpu_remove_workers_from_sched_ctx(int *workerids, int nworkers_to_remove, struct starpu_sched_ctx *sched_ctx)
			
 
				 {
			
 
				   	struct starpu_machine_config_s *config = (struct starpu_machine_config_s *)_starpu_get_machine_config();
			
 
				 	int nworkers = config->topology.nworkers;