Просмотр исходного кода

Modified example in order to avoid duplication of cholesky code

Andra Hugo лет назад: 14
Родитель
Commit
ef5bbf9eb0

+ 3 - 0
examples/Makefile.am

@@ -55,6 +55,7 @@ EXTRA_DIST = 					\
 	lu/xlu_implicit_pivot.c			\
 	lu/xlu_kernels.c			\
 	lu/lu_example.c				\
+	sched_ctx_utils/sched_ctx_utils.c		\
 	cholesky_2ctxs/cholesky_2ctxs.c			\
 	cholesky_no_ctxs/cholesky_no_ctxs.c			\
 	incrementer/incrementer_kernels_opencl_kernel.cl 	\
@@ -118,6 +119,7 @@ noinst_HEADERS = 				\
 	lu/complex_double.h			\
 	lu/blas_complex.h			\
 	cholesky/cholesky.h			\
+	sched_ctx_utils/sched_ctx_utils.h	\
 	common/blas_model.h			\
 	common/blas.h				\
 	mult/simple.h				\
@@ -435,6 +437,7 @@ cholesky_cholesky_implicit_SOURCES =		\
 	cholesky/cholesky_implicit.c		\
 	cholesky/cholesky_models.c		\
 	cholesky/cholesky_kernels.c		\
+	sched_ctx_utils/sched_ctx_utils.c	\
 	common/blas.c
 
 cholesky_cholesky_implicit_LDADD =		\

+ 22 - 11
examples/cholesky/cholesky.h

@@ -61,6 +61,8 @@ static unsigned nbigblocks = 8;
 static unsigned pinned = 0;
 static unsigned noprio = 0;
 static unsigned check = 0;
+static unsigned with_ctxs = 0;
+static unsigned with_noctxs = 0;
 
 void chol_cpu_codelet_update_u11(void **, void *);
 void chol_cpu_codelet_update_u21(void **, void *);
@@ -80,37 +82,46 @@ static void __attribute__((unused)) parse_args(int argc, char **argv)
 {
 	int i;
 	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-with_ctxs") == 0) {
+			with_ctxs = 1;
+			break;
+		}
+		if (strcmp(argv[i], "-with_noctxs") == 0) {
+			with_noctxs = 1;
+			break;
+		}
+
 		if (strcmp(argv[i], "-size") == 0) {
-		        char *argptr;
+			char *argptr;
 			size = strtol(argv[++i], &argptr, 10);
 		}
-
+		
 		if (strcmp(argv[i], "-nblocks") == 0) {
-		        char *argptr;
+			char *argptr;
 			nblocks = strtol(argv[++i], &argptr, 10);
 		}
-
+		
 		if (strcmp(argv[i], "-nbigblocks") == 0) {
-		        char *argptr;
+			char *argptr;
 			nbigblocks = strtol(argv[++i], &argptr, 10);
 		}
-
+		
 		if (strcmp(argv[i], "-pin") == 0) {
 			pinned = 1;
 		}
-
+		
 		if (strcmp(argv[i], "-no-prio") == 0) {
 			noprio = 1;
 		}
-
+		
 		if (strcmp(argv[i], "-check") == 0) {
 			check = 1;
 		}
-
+		
 		if (strcmp(argv[i], "-h") == 0) {
 			printf("usage : %s [-pin] [-size size] [-nblocks nblocks] [-check]\n", argv[0]);
-		}
-	}
+		}	
+	}	
 }
 
 #endif /* __DW_CHOLESKY_H__ */

+ 40 - 16
examples/cholesky/cholesky_implicit.c

@@ -17,7 +17,7 @@
  */
 
 #include "cholesky.h"
-
+#include "../sched_ctx_utils/sched_ctx_utils.h"
 /*
  *	Create the codelets
  */
@@ -126,13 +126,20 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 	gettimeofday(&end, NULL);
 
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	FPRINTF(stderr, "Computation took (in ms)\n");
-	FPRINTF(stdout, "%2.2f\n", timing/1000);
 
 	unsigned long n = starpu_matrix_get_nx(dataA);
 
 	double flop = (1.0f*n*n*n)/3.0f;
-	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+
+	if(with_ctxs || with_noctxs)
+		update_sched_ctx_timing_results((flop/timing/1000.0f), (timing/1000000.0f));
+	else
+	{
+		FPRINTF(stderr, "Computation took (in ms)\n");
+		FPRINTF(stdout, "%2.2f\n", timing/1000);
+	
+		FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+	}
 }
 
 static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
@@ -158,19 +165,8 @@ static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 	_cholesky(dataA, nblocks);
 }
 
-int main(int argc, char **argv)
+static void execute_cholesky(unsigned size, unsigned nblocks)
 {
-	/* create a simple definite positive symetric matrix example
-	 *
-	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
-	 * */
-
-	parse_args(argc, argv);
-
-	starpu_init(NULL);
-
-	starpu_helper_cublas_init();
-
 	float *mat;
 	starpu_malloc((void **)&mat, (size_t)size*size*sizeof(float));
 
@@ -274,6 +270,34 @@ int main(int argc, char **argv)
 	        }
 	}
 
+}
+
+int main(int argc, char **argv)
+{
+	/* create a simple symmetric positive-definite matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
+	 * */
+
+	parse_args(argc, argv);
+
+	if(with_ctxs || with_noctxs)
+		parse_args_ctx(argc, argv);
+
+	starpu_init(NULL);
+
+	starpu_helper_cublas_init();
+
+	if(with_ctxs)
+	{
+		construct_contexts(execute_cholesky);
+		start_2benchs(execute_cholesky);
+	}
+	else if(with_noctxs)
+		start_2benchs(execute_cholesky);
+	else
+		execute_cholesky(size, nblocks);
+
 	starpu_helper_cublas_shutdown();
 	starpu_shutdown();
 

+ 1 - 1
examples/cholesky_2ctxs/cholesky/cholesky.h

@@ -71,7 +71,7 @@ void chol_cublas_codelet_update_u21(void *descr[], void *_args);
 void chol_cublas_codelet_update_u22(void *descr[], void *_args);
 #endif
 
-double run_cholesky_implicit(unsigned sched_ctx, int start, int argc, char **argv, double *timing, pthread_barrier_t *barrier);
+double run_cholesky_implicit(int start, int argc, char **argv, double *timing, pthread_barrier_t *barrier);
 
 extern struct starpu_perfmodel_t chol_model_11;
 extern struct starpu_perfmodel_t chol_model_21;

+ 5 - 6
examples/cholesky_2ctxs/cholesky/cholesky_implicit.c

@@ -70,7 +70,7 @@ static void callback_turn_spmd_on(void *arg __attribute__ ((unused)))
 	cl22.type = STARPU_SPMD;
 }
 
-static double _cholesky(starpu_data_handle dataA, unsigned nblocks, unsigned sched_ctx, double *timing)
+static double _cholesky(starpu_data_handle dataA, unsigned nblocks, double *timing)
 {
 	struct timeval start;
 	struct timeval end;
@@ -137,7 +137,7 @@ static double _cholesky(starpu_data_handle dataA, unsigned nblocks, unsigned sch
 	return gflops;
 }
 
-static double cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned sched_ctx, double *timing)
+static double cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks, double *timing)
 {
 	starpu_data_handle dataA;
 
@@ -156,12 +156,12 @@ static double cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks
 	};
 
 	starpu_data_map_filters(dataA, 2, &f, &f2);
-	double gflops = _cholesky(dataA, nblocks, sched_ctx, timing);
+	double gflops = _cholesky(dataA, nblocks, timing);
 	starpu_data_unregister(dataA);
 	return gflops;
 }
 
-double run_cholesky_implicit(unsigned sched_ctx, int start, int argc, char **argv, double *timing, pthread_barrier_t *barrier)
+double run_cholesky_implicit(int start, int argc, char **argv, double *timing, pthread_barrier_t *barrier)
 {
 	/* create a simple definite positive symetric matrix example
 	 *
@@ -208,8 +208,7 @@ double run_cholesky_implicit(unsigned sched_ctx, int start, int argc, char **arg
 		printf("\n");
 	}
 #endif
-//	starpu_set_sched_ctx(&sched_ctx);
-	double gflops = cholesky(mat, size, size, nblocks, sched_ctx, timing);
+	double gflops = cholesky(mat, size, size, nblocks, timing);
 
 #ifdef PRINT_OUTPUT
 	printf("Results :\n");

+ 1 - 1
examples/cholesky_2ctxs/cholesky_2ctxs.c

@@ -52,7 +52,7 @@ void* func_cholesky(void *val){
   starpu_set_sched_ctx(sched_ctx);
   for(i = 0; i < NSAMPLES; i++)
     {
-      rv->flops += run_cholesky_implicit(*sched_ctx, p->start, p->argc, p->argv, &timing, &barrier);
+      rv->flops += run_cholesky_implicit(p->start, p->argc, p->argv, &timing, &barrier);
       rv->avg_timing += timing;
 
     }

+ 229 - 0
examples/sched_ctx_utils/sched_ctx_utils.c

@@ -0,0 +1,229 @@
+#include "sched_ctx_utils.h"
+#include <starpu.h>
+
+unsigned size1; /* size argument passed to the first benchmark (option -size1) */
+unsigned size2; /* size argument passed to the second benchmark (option -size2) */
+unsigned nblocks1; /* nblocks argument passed to the first benchmark (option -nblocks1) */
+unsigned nblocks2; /* nblocks argument passed to the second benchmark (option -nblocks2) */
+unsigned cpu1; /* count added to the first context's worker list in construct_contexts() (option -cpu1) */
+unsigned cpu2; /* count added to the second context's worker list in construct_contexts() (option -cpu2) */
+unsigned gpu; /* count of ids placed in BOTH contexts' worker lists (option -gpu) */
+unsigned gpu1; /* count of ids placed only in the first context's worker list (option -gpu1) */
+unsigned gpu2; /* count of ids placed only in the second context's worker list (option -gpu2) */
+/* Arguments handed to one benchmark thread (see start_bench()). */
+typedef struct {
+	unsigned id; /* index of this thread's slot in rv[] */
+	unsigned ctx; /* scheduling context id; 0 means no context was created for this thread */
+	int the_other_ctx; /* second argument given to starpu_delete_sched_ctx() */
+	int *procs; /* array passed to starpu_create_sched_ctx() for ctx (set in construct_contexts()) */
+	int nprocs; /* number of entries in procs */
+	void (*bench)(unsigned, unsigned); /* benchmark entry point, called as bench(size, nblocks) */
+	unsigned size; /* first argument of bench */
+	unsigned nblocks; /* second argument of bench */
+} params;
+/* Results accumulated for one benchmark thread. */
+typedef struct {
+	double flops; /* running sum fed by update_sched_ctx_timing_results(), divided by NSAMPLES in start_bench() */
+	double avg_timing; /* running sum fed by update_sched_ctx_timing_results(), divided by NSAMPLES in start_bench() */
+} retvals;
+/* NSAMPLES: number of times each thread runs its benchmark. */
+#define NSAMPLES 3
+int first = 1; /* nonzero until the first context-bound thread finishes; accessed under mut */
+pthread_mutex_t mut; /* protects first; initialised and destroyed in start_2benchs() */
+retvals rv[2]; /* one result slot per benchmark thread, indexed by params.id */
+params p1, p2; /* arguments of the two benchmark threads */
+/* key: thread-specific key whose value points to the running thread's params.id. */
+pthread_key_t key;
+
+/*
+ * Reset every benchmark parameter and both result slots to their defaults
+ * and create the thread-specific key that maps a benchmark thread to its
+ * slot in rv[].  Called by parse_args_ctx() before the command line is read.
+ */
+void init(void)
+{
+	/* default size / block count for each of the two benchmarks */
+	size1 = 4*1024;
+	size2 = 4*1024;
+	nblocks1 = 16;
+	nblocks2 = 16;
+
+	/* by default no worker is assigned to any context */
+	cpu1 = 0;
+	cpu2 = 0;
+	gpu = 0;
+	gpu1 = 0;
+	gpu2 = 0;
+
+	/* clear BOTH result slots (slot 0's timing used to be left out) */
+	rv[0].flops = 0.0;
+	rv[1].flops = 0.0;
+	rv[0].avg_timing = 0.0;
+	rv[1].avg_timing = 0.0;
+
+	/* ctx == 0 means "no scheduling context created" for start_bench() */
+	p1.ctx = 0;
+	p2.ctx = 0;
+
+	/* each thread owns the rv[] slot designated by its id */
+	p1.id = 0;
+	p2.id = 1;
+
+	/* without the key the threads cannot find their result slot */
+	if (pthread_key_create(&key, NULL) != 0)
+		abort();
+}
+
+/*
+ * Add one sample to the result slot of the calling thread.
+ *
+ * flops      : value added to rv[id].flops
+ * avg_timing : value added to rv[id].avg_timing
+ *
+ * The slot index is read from the thread-specific value registered by
+ * start_bench().  A thread that never registered one has no slot, so the
+ * sample is dropped instead of dereferencing a NULL pointer.
+ */
+void update_sched_ctx_timing_results(double flops, double avg_timing)
+{
+	const unsigned *id = pthread_getspecific(key);
+
+	if (id == NULL)
+		return;
+
+	rv[*id].flops += flops;
+	rv[*id].avg_timing += avg_timing;
+}
+
+/*
+ * Thread entry point: run one benchmark NSAMPLES times and average its results.
+ *
+ * val : pointer to the params describing this thread (p1 or p2).
+ *
+ * Returns NULL; the results are left in rv[p->id].
+ */
+void* start_bench(void *val)
+{
+	params *p = (params*)val;
+	int i;
+
+	/* lets update_sched_ctx_timing_results() find this thread's rv[] slot */
+	pthread_setspecific(key, &p->id);
+
+	/* ctx == 0 means construct_contexts() was not called */
+	if(p->ctx != 0)
+		starpu_set_sched_ctx(&p->ctx);
+
+	for(i = 0; i < NSAMPLES; i++)
+		p->bench(p->size, p->nblocks);
+
+	if(p->ctx != 0)
+	{
+		/* only the first thread to finish deletes its context */
+		pthread_mutex_lock(&mut);
+		if(first){
+			starpu_delete_sched_ctx(p->ctx, p->the_other_ctx);
+		}
+
+		first = 0;
+		pthread_mutex_unlock(&mut);
+	}
+
+	/* turn the accumulated sums into per-sample averages */
+	rv[p->id].flops /= NSAMPLES;
+	rv[p->id].avg_timing /= NSAMPLES;
+
+	/* the function is declared void*: do not fall off the end without a value */
+	return NULL;
+}
+
+/*
+ * Run the same benchmark concurrently in two threads and print the results.
+ *
+ * bench : benchmark entry point, called as bench(size, nblocks) by each thread
+ *         with (size1, nblocks1) and (size2, nblocks2) respectively.
+ *
+ * Output (one line on stdout):
+ *   <flops 1> <flops 2> <avg timing 1> <avg timing 2> <total wall time in s>
+ */
+void start_2benchs(void (*bench)(unsigned, unsigned))
+{
+	p1.bench = bench;
+	p1.size = size1;
+	p1.nblocks = nblocks1;
+
+	p2.bench = bench;
+	p2.size = size2;
+	p2.nblocks = nblocks2;
+
+	pthread_t tid[2];
+	pthread_mutex_init(&mut, NULL);
+
+	struct timeval start;
+	struct timeval end;
+
+	gettimeofday(&start, NULL);
+
+	/* start_bench already has the void *(*)(void *) type pthread_create expects */
+	pthread_create(&tid[0], NULL, start_bench, &p1);
+	pthread_create(&tid[1], NULL, start_bench, &p2);
+
+	pthread_join(tid[0], NULL);
+	pthread_join(tid[1], NULL);
+
+	gettimeofday(&end, NULL);
+
+	pthread_mutex_destroy(&mut);
+
+	/* elapsed wall time, converted from microseconds to seconds */
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	timing /= 1000000;
+
+	/* rv[] has exactly two slots: 0 for the first thread, 1 for the second */
+	printf("%2.2f %2.2f ", rv[0].flops, rv[1].flops);
+	printf("%2.2f %2.2f %2.2f\n", rv[0].avg_timing, rv[1].avg_timing, timing);
+
+}
+/*
+ * Create the two scheduling contexts used by the benchmark threads.
+ *
+ * Ids are laid out as [shared gpus | gpus of ctx1 | gpus of ctx2 | cpus of
+ * ctx1 | cpus of ctx2], using the counts gpu, gpu1, gpu2, cpu1 and cpu2.
+ * The first context ("heft") gets the shared gpus, its own gpus and its own
+ * cpus; the second one ("prio") gets the shared gpus, its own gpus and its
+ * own cpus.
+ *
+ * bench : unused, kept so that existing callers keep compiling.
+ *
+ * The id arrays are stored in p1.procs / p2.procs and therefore must outlive
+ * this function: they are heap-allocated and intentionally never freed.
+ */
+void construct_contexts(void (*bench)(unsigned, unsigned))
+{
+	unsigned n_all_gpus = gpu + gpu1 + gpu2;
+	int nprocs1 = cpu1 + gpu + gpu1;
+	int nprocs2 = cpu2 + gpu + gpu2;
+	int *procs;
+	int *procs2;
+	unsigned i;
+	int k;
+
+	(void)bench;
+
+	/* allocate at least one element so an empty context never asks for 0 bytes */
+	procs = malloc((size_t)(nprocs1 > 0 ? nprocs1 : 1) * sizeof(*procs));
+	procs2 = malloc((size_t)(nprocs2 > 0 ? nprocs2 : 1) * sizeof(*procs2));
+	if (procs == NULL || procs2 == NULL)
+		abort();
+
+	/* first context: shared gpus, then its own gpus, then its own cpus */
+	k = 0;
+	for(i = 0; i < gpu; i++)
+		procs[k++] = (int)i;
+
+	for(i = gpu; i < gpu + gpu1; i++)
+		procs[k++] = (int)i;
+
+	for(i = n_all_gpus; i < n_all_gpus + cpu1; i++)
+		procs[k++] = (int)i;
+
+	p1.ctx = starpu_create_sched_ctx("heft", procs, nprocs1, "sched_ctx1");
+	p2.the_other_ctx = (int)p1.ctx;
+	p1.procs = procs;
+	p1.nprocs = nprocs1;
+
+	/* second context: shared gpus, then its own gpus, then its own cpus */
+	k = 0;
+	for(i = 0; i < gpu; i++)
+		procs2[k++] = (int)i;
+
+	for(i = gpu + gpu1; i < gpu + gpu1 + gpu2; i++)
+		procs2[k++] = (int)i;
+
+	for(i = n_all_gpus + cpu1; i < n_all_gpus + cpu1 + cpu2; i++)
+		procs2[k++] = (int)i;
+
+	p2.ctx = starpu_create_sched_ctx("prio", procs2, nprocs2, "sched_ctx2");
+	p1.the_other_ctx = (int)p2.ctx;
+	p2.procs = procs2;
+	p2.nprocs = nprocs2;
+}
+
+
+/*
+ * Reset the benchmark parameters to their defaults (init()) and override them
+ * from the command line.
+ *
+ * Recognised options, each followed by a base-10 integer:
+ *   -size1 -nblocks1 -size2 -nblocks2 -cpu1 -cpu2 -gpu -gpu1 -gpu2
+ *
+ * Unknown arguments are ignored.  An option given as the very last argument
+ * has no value to read and is ignored as well, instead of reading past the
+ * end of argv.
+ */
+void parse_args_ctx(int argc, char **argv)
+{
+	/* option name -> variable that receives its value */
+	static const struct {
+		const char *name;
+		unsigned *value;
+	} options[] = {
+		{ "-size1",    &size1 },
+		{ "-nblocks1", &nblocks1 },
+		{ "-size2",    &size2 },
+		{ "-nblocks2", &nblocks2 },
+		{ "-cpu1",     &cpu1 },
+		{ "-cpu2",     &cpu2 },
+		{ "-gpu",      &gpu },
+		{ "-gpu1",     &gpu1 },
+		{ "-gpu2",     &gpu2 },
+	};
+	const unsigned noptions = sizeof(options) / sizeof(options[0]);
+	unsigned j;
+	int i;
+
+	init();
+
+	for (i = 1; i < argc; i++) {
+		for (j = 0; j < noptions; j++) {
+			if (strcmp(argv[i], options[j].name) != 0)
+				continue;
+
+			/* the value is the next argument, if there is one */
+			if (i + 1 < argc)
+				*options[j].value = strtol(argv[++i], NULL, 10);
+			break;
+		}
+	}
+}
+

+ 10 - 0
examples/sched_ctx_utils/sched_ctx_utils.h

@@ -0,0 +1,10 @@
+#ifndef SCHED_CTX_UTILS_H
+#define SCHED_CTX_UTILS_H
+
+#include <limits.h>
+#include <string.h>
+#include <math.h>
+#include <sys/time.h>
+#include <stdlib.h>
+
+/*
+ * Reset the benchmark parameters to their defaults, then read the
+ * -size1/-size2, -nblocks1/-nblocks2, -cpu1/-cpu2 and -gpu/-gpu1/-gpu2
+ * options from the command line.
+ */
+void parse_args_ctx(int argc, char **argv);
+
+/*
+ * Add one (gflops, timing) sample to the results of the calling benchmark
+ * thread.  Meant to be called from the bench function given to start_2benchs().
+ */
+void update_sched_ctx_timing_results(double gflops, double timing);
+
+/*
+ * Create the two scheduling contexts ("heft" and "prio") from the cpu/gpu
+ * counts read by parse_args_ctx().
+ */
+void construct_contexts(void (*bench)(unsigned size, unsigned nblocks));
+
+/*
+ * Run bench concurrently in two threads, several times each, and print the
+ * averaged results followed by the total elapsed time on stdout.
+ */
+void start_2benchs(void (*bench)(unsigned size, unsigned nblocks));
+
+#endif /* SCHED_CTX_UTILS_H */

+ 1 - 2
src/core/sched_ctx.c

@@ -320,8 +320,7 @@ void starpu_add_workers_to_sched_ctx(int *workerids, int nworkers,
 	return;
 }
 
-static void _starpu_remove_workers_from_sched_ctx(int *workerids, int nworkers_to_remove, 
-					  struct starpu_sched_ctx *sched_ctx)
+static void _starpu_remove_workers_from_sched_ctx(int *workerids, int nworkers_to_remove, struct starpu_sched_ctx *sched_ctx)
 {
   	struct starpu_machine_config_s *config = (struct starpu_machine_config_s *)_starpu_get_machine_config();
 	int nworkers = config->topology.nworkers;