浏览代码

Make Cholesky use asynchronous CUDA tasks. Also factorize the codelet structures along the way.

Samuel Thibault 11 年之前
父节点
当前提交
87bf731e40

+ 9 - 1
examples/cholesky/cholesky.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -128,6 +128,14 @@ static unsigned with_noctxs = 0;
 static unsigned chole1 = 0;
 static unsigned chole2 = 0;
 
+struct starpu_perfmodel chol_model_11;
+struct starpu_perfmodel chol_model_21;
+struct starpu_perfmodel chol_model_22;
+
+struct starpu_codelet cl11;
+struct starpu_codelet cl21;
+struct starpu_codelet cl22;
+
 void chol_cpu_codelet_update_u11(void **, void *);
 void chol_cpu_codelet_update_u21(void **, void *);
 void chol_cpu_codelet_update_u22(void **, void *);

+ 1 - 38
examples/cholesky/cholesky_grain_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
@@ -18,10 +18,6 @@
 
 #include "cholesky.h"
 
-struct starpu_perfmodel chol_model_11;
-struct starpu_perfmodel chol_model_21;
-struct starpu_perfmodel chol_model_22;
-
 /*
  *	Some useful functions
  */
@@ -40,17 +36,6 @@ static struct starpu_task *create_task(starpu_tag_t id)
  *	Create the codelets
  */
 
-static struct starpu_codelet cl11 =
-{
-	.modes = { STARPU_RW },
-	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
-#endif
-	.nbuffers = 1,
-	.model = &chol_model_11
-};
-
 static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned k, unsigned reclevel)
 {
 /*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
@@ -77,17 +62,6 @@ static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned
 	return task;
 }
 
-static struct starpu_codelet cl21 =
-{
-	.modes = { STARPU_R, STARPU_RW },
-	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
-#endif
-	.nbuffers = 2,
-	.model = &chol_model_21
-};
-
 static int create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j, unsigned reclevel)
 {
 	int ret;
@@ -123,17 +97,6 @@ static int create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j, un
 	return ret;
 }
 
-static struct starpu_codelet cl22 =
-{
-	.modes = { STARPU_R, STARPU_R, STARPU_RW },
-	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
-#endif
-	.nbuffers = 3,
-	.model = &chol_model_22
-};
-
 static int create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, unsigned j, unsigned reclevel)
 {
 	int ret;

+ 0 - 49
examples/cholesky/cholesky_implicit.c

@@ -18,55 +18,6 @@
 
 #include "cholesky.h"
 #include "../sched_ctx_utils/sched_ctx_utils.h"
-/*
- *	Create the codelets
- */
-struct starpu_perfmodel chol_model_11;
-struct starpu_perfmodel chol_model_21;
-struct starpu_perfmodel chol_model_22;
-
-static struct starpu_codelet cl11 =
-{
-	.type = STARPU_SEQ,
-	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
-#elif defined(STARPU_SIMGRID)
-	.cuda_funcs = {(void*)1, NULL},
-#endif
-	.nbuffers = 1,
-	.modes = {STARPU_RW},
-	.model = &chol_model_11
-};
-
-static struct starpu_codelet cl21 =
-{
-	.type = STARPU_SEQ,
-	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
-#elif defined(STARPU_SIMGRID)
-	.cuda_funcs = {(void*)1, NULL},
-#endif
-	.nbuffers = 2,
-	.modes = {STARPU_R, STARPU_RW},
-	.model = &chol_model_21
-};
-
-static struct starpu_codelet cl22 =
-{
-	.type = STARPU_SEQ,
-	.max_parallelism = INT_MAX,
-	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
-#elif defined(STARPU_SIMGRID)
-	.cuda_funcs = {(void*)1, NULL},
-#endif
-	.nbuffers = 3,
-	.modes = {STARPU_R, STARPU_R, STARPU_RW | STARPU_COMMUTE},
-	.model = &chol_model_22
-};
 
 /*
  *	code to bootstrap the factorization

+ 49 - 4
examples/cholesky/cholesky_kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -74,7 +74,6 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STAR
 		cublasSgemm('n', 't', dy, dx, dz, 
 				-1.0f, left, ld21, right, ld12, 
 				 1.0f, center, ld22);
-		cudaStreamSynchronize(starpu_cuda_get_local_stream());
 #endif
 
 	}
@@ -119,7 +118,6 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, STARPU_A
 #ifdef STARPU_USE_CUDA
 		case 1:
 			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 			break;
 #endif
 		default:
@@ -193,7 +191,6 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 				fprintf(stderr, "Error in Magma: %d\n", ret);
 				STARPU_ABORT();
 			}
-			cudaError_t cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 			STARPU_ASSERT(!cures);
 			}
 #else
@@ -246,3 +243,51 @@ void chol_cublas_codelet_update_u11(void *descr[], void *_args)
 	chol_common_codelet_update_u11(descr, 1, _args);
 }
 #endif/* STARPU_USE_CUDA */
+
+struct starpu_codelet cl11 =
+{
+	.type = STARPU_SEQ,
+	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
+#elif defined(STARPU_SIMGRID)
+	.cuda_funcs = {(void*)1, NULL},
+#endif
+#ifdef STARPU_HAVE_MAGMA
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+#endif
+	.nbuffers = 1,
+	.modes = { STARPU_RW },
+	.model = &chol_model_11
+};
+
+struct starpu_codelet cl21 =
+{
+	.type = STARPU_SEQ,
+	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
+#elif defined(STARPU_SIMGRID)
+	.cuda_funcs = {(void*)1, NULL},
+#endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+	.nbuffers = 2,
+	.modes = { STARPU_R, STARPU_RW },
+	.model = &chol_model_21
+};
+
+struct starpu_codelet cl22 =
+{
+	.type = STARPU_SEQ,
+	.max_parallelism = INT_MAX,
+	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
+#elif defined(STARPU_SIMGRID)
+	.cuda_funcs = {(void*)1, NULL},
+#endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+	.nbuffers = 3,
+	.modes = { STARPU_R, STARPU_R, STARPU_RW | STARPU_COMMUTE },
+	.model = &chol_model_22
+};

+ 1 - 38
examples/cholesky/cholesky_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
@@ -19,10 +19,6 @@
 #include "cholesky.h"
 #include <starpu_perfmodel.h>
 
-struct starpu_perfmodel chol_model_11;
-struct starpu_perfmodel chol_model_21;
-struct starpu_perfmodel chol_model_22;
-
 /*
  *	Some useful functions
  */
@@ -41,17 +37,6 @@ static struct starpu_task *create_task(starpu_tag_t id)
  *	Create the codelets
  */
 
-static struct starpu_codelet cl11 =
-{
-	.modes = { STARPU_RW },
-	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
-#endif
-	.nbuffers = 1,
-	.model = &chol_model_11
-};
-
 static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned k)
 {
 /*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
@@ -79,17 +64,6 @@ static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned
 	return task;
 }
 
-static struct starpu_codelet cl21 =
-{
-	.modes = { STARPU_R, STARPU_RW },
-	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
-#endif
-	.nbuffers = 2,
-	.model = &chol_model_21
-};
-
 static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j)
 {
 	struct starpu_task *task = create_task(TAG21(k, j));
@@ -127,17 +101,6 @@ static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j)
 
 }
 
-static struct starpu_codelet cl22 =
-{
-	.modes = { STARPU_R, STARPU_R, STARPU_RW },
-	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
-#endif
-	.nbuffers = 3,
-	.model = &chol_model_22
-};
-
 static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, unsigned j)
 {
 /*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */

+ 1 - 38
examples/cholesky/cholesky_tile_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -17,10 +17,6 @@
 
 #include "cholesky.h"
 
-struct starpu_perfmodel chol_model_11;
-struct starpu_perfmodel chol_model_21;
-struct starpu_perfmodel chol_model_22;
-
 /* A [ y ] [ x ] */
 float *A[NMAXBLOCKS][NMAXBLOCKS];
 starpu_data_handle_t A_state[NMAXBLOCKS][NMAXBLOCKS];
@@ -43,17 +39,6 @@ static struct starpu_task *create_task(starpu_tag_t id)
  *	Create the codelets
  */
 
-static struct starpu_codelet cl11 =
-{
-	.modes = { STARPU_RW },
-	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
-#endif
-	.nbuffers = 1,
-	.model = &chol_model_11
-};
-
 static struct starpu_task * create_task_11(unsigned k, unsigned nblocks)
 {
 /*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
@@ -80,17 +65,6 @@ static struct starpu_task * create_task_11(unsigned k, unsigned nblocks)
 	return task;
 }
 
-static struct starpu_codelet cl21 =
-{
-	.modes = { STARPU_R, STARPU_RW },
-	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
-#endif
-	.nbuffers = 2,
-	.model = &chol_model_21
-};
-
 static int create_task_21(unsigned k, unsigned j)
 {
 	int ret;
@@ -126,17 +100,6 @@ static int create_task_21(unsigned k, unsigned j)
 	return ret;
 }
 
-static struct starpu_codelet cl22 =
-{
-	.modes = { STARPU_R, STARPU_R, STARPU_RW },
-	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
-#endif
-	.nbuffers = 3,
-	.model = &chol_model_22
-};
-
 static int create_task_22(unsigned k, unsigned i, unsigned j)
 {
 	int ret;