Selaa lähdekoodia

Fix cublas initialization/shutdown when using one thread per stream with multistream

Samuel Thibault 8 vuotta sitten
vanhempi
commit
52d97c6ae9
3 muutettua tiedostoa jossa 33 lisäystä ja 13 poistoa
  1. 4 5
      src/core/topology.c
  2. 2 1
      src/core/workers.h
  3. 27 7
      src/drivers/cuda/starpu_cublas.c

+ 4 - 5
src/core/topology.c

@@ -1220,7 +1220,7 @@ _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_con
 	_starpu_initialize_workers_cuda_gpuid(config);
 
 	/* allow having one worker per stream */
-	unsigned th_per_stream = starpu_get_env_number_default("STARPU_CUDA_THREAD_PER_WORKER", 0);
+	topology->cuda_th_per_stream = starpu_get_env_number_default("STARPU_CUDA_THREAD_PER_WORKER", 0);
 
 	unsigned cudagpu;
 	for (cudagpu = 0; cudagpu < topology->ncudagpus; cudagpu++)
@@ -1233,7 +1233,7 @@ _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_con
 		for (i = 0; i < nworker_per_cuda; i++)
 		{
 			int worker_idx = worker_idx0 + i;
-			if(th_per_stream)
+			if(topology->cuda_th_per_stream)
 			{
 				/* Just one worker in the set */
 				config->workers[worker_idx].set = (struct _starpu_worker_set *)calloc(1, sizeof(struct _starpu_worker_set));
@@ -1454,7 +1454,7 @@ _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_con
 #endif /* STARPU_USE_MPI_MASTER_SLAVE */
 			unsigned cuda_busy_cpus = 0;
 #if defined(STARPU_USE_CUDA)
-			cuda_busy_cpus = th_per_stream ? (nworker_per_cuda * topology->ncudagpus) : 
+			cuda_busy_cpus = topology->cuda_th_per_stream ? (nworker_per_cuda * topology->ncudagpus) : 
 				topology->ncudagpus;
 #endif
 			unsigned already_busy_cpus = mpi_ms_busy_cpus + mic_busy_cpus 
@@ -1731,7 +1731,6 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 	unsigned cuda_init[STARPU_MAXCUDADEVS] = { };
 	unsigned cuda_memory_nodes[STARPU_MAXCUDADEVS];
 	unsigned cuda_bindid[STARPU_MAXCUDADEVS];
-	unsigned th_per_stream = starpu_get_env_number_default("STARPU_CUDA_THREAD_PER_WORKER", 0);
 #endif
 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
 	unsigned opencl_init[STARPU_MAXOPENCLDEVS] = { };
@@ -1814,7 +1813,7 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 				if (cuda_init[devid])
 				{
 					memory_node = cuda_memory_nodes[devid];
-					if (th_per_stream == 0)
+					if (config->topology.cuda_th_per_stream == 0)
 						workerarg->bindid = cuda_bindid[devid];
 					else
 						workerarg->bindid = _starpu_get_next_bindid(config, preferred_binding, npreferred);

+ 2 - 1
src/core/workers.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
  * Copyright (C) 2011, 2016  INRIA
  * Copyright (C) 2016  Uppsala University
@@ -243,6 +243,7 @@ struct _starpu_machine_topology
 	/* Actual number of CUDA GPUs used by StarPU. */
 	unsigned ncudagpus;
 	unsigned nworkerpercuda;
+	unsigned cuda_th_per_stream;
 
 	/* Actual number of OpenCL workers used by StarPU. */
 	unsigned nopenclgpus;

+ 27 - 7
src/drivers/cuda/starpu_cublas.c

@@ -18,32 +18,52 @@
 #include <starpu.h>
 #include <starpu_cuda.h>
 #include <common/config.h>
+#include <core/workers.h>
 
 #ifdef STARPU_USE_CUDA
 #include <cublas.h>
 
+static int cublas_initialized[STARPU_NMAXWORKERS];
+
+static unsigned get_idx(void) {
+	unsigned workerid = starpu_worker_get_id_check();
+	unsigned th_per_stream = _starpu_get_machine_config()->topology.cuda_th_per_stream;
+
+	if (th_per_stream)
+		return workerid;
+	else
+		return starpu_worker_get_devid(workerid);
+}
+
 static void init_cublas_func(void *args STARPU_ATTRIBUTE_UNUSED)
 {
-	cublasStatus cublasst = cublasInit();
-	if (STARPU_UNLIKELY(cublasst))
-		STARPU_CUBLAS_REPORT_ERROR(cublasst);
+	unsigned idx = get_idx();
+	if (STARPU_ATOMIC_ADD(&cublas_initialized[idx], 1) == 1)
+	{
+		cublasStatus cublasst = cublasInit();
+		if (STARPU_UNLIKELY(cublasst))
+			STARPU_CUBLAS_REPORT_ERROR(cublasst);
+	}
+}
 
+static void set_cublas_stream_func(void *args STARPU_ATTRIBUTE_UNUSED)
+{
 	cublasSetKernelStream(starpu_cuda_get_local_stream());
 }
 
 static void shutdown_cublas_func(void *args STARPU_ATTRIBUTE_UNUSED)
 {
-	cublasShutdown();
+	unsigned idx = get_idx();
+	if (STARPU_ATOMIC_ADD(&cublas_initialized[idx], -1) == 0)
+		cublasShutdown();
 }
 #endif
 
-#ifdef STARPU_DEVEL
-#warning FIXME should actually be done once per driver thread only, otherwise shutdown crashes
-#endif
 void starpu_cublas_init(void)
 {
 #ifdef STARPU_USE_CUDA
 	starpu_execute_on_each_worker(init_cublas_func, NULL, STARPU_CUDA);
+	starpu_execute_on_each_worker(set_cublas_stream_func, NULL, STARPU_CUDA);
 #endif
 }