8 vuotta sitten · 52d97c6ae9
--- a/src/core/topology.c
+++ b/src/core/topology.c
@@ -1220,7 +1220,7 @@ _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_con
 
				 	_starpu_initialize_workers_cuda_gpuid(config);
			
 
				 
			
 
				 	/* allow having one worker per stream */
			
 
				-	unsigned th_per_stream = starpu_get_env_number_default("STARPU_CUDA_THREAD_PER_WORKER", 0);
			
 
				+	topology->cuda_th_per_stream = starpu_get_env_number_default("STARPU_CUDA_THREAD_PER_WORKER", 0);
			
 
				 
			
 
				 	unsigned cudagpu;
			
 
				 	for (cudagpu = 0; cudagpu < topology->ncudagpus; cudagpu++)
			
@@ -1233,7 +1233,7 @@ _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_con
 
				 		for (i = 0; i < nworker_per_cuda; i++)
			
 
				 		{
			
 
				 			int worker_idx = worker_idx0 + i;
			
 
				-			if(th_per_stream)
			
 
				+			if(topology->cuda_th_per_stream)
			
 
				 			{
			
 
				 				/* Just one worker in the set */
			
 
				 				config->workers[worker_idx].set = (struct _starpu_worker_set *)calloc(1, sizeof(struct _starpu_worker_set));
			
@@ -1454,7 +1454,7 @@ _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_con
 
				 #endif /* STARPU_USE_MPI_MASTER_SLAVE */
			
 
				 			unsigned cuda_busy_cpus = 0;
			
 
				 #if defined(STARPU_USE_CUDA)
			
 
				-			cuda_busy_cpus = th_per_stream ? (nworker_per_cuda * topology->ncudagpus) : 
			
 
				+			cuda_busy_cpus = topology->cuda_th_per_stream ? (nworker_per_cuda * topology->ncudagpus) : 
			
 
				 				topology->ncudagpus;
			
 
				 #endif
			
 
				 			unsigned already_busy_cpus = mpi_ms_busy_cpus + mic_busy_cpus 
			
@@ -1731,7 +1731,6 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
				 	unsigned cuda_init[STARPU_MAXCUDADEVS] = { };
			
 
				 	unsigned cuda_memory_nodes[STARPU_MAXCUDADEVS];
			
 
				 	unsigned cuda_bindid[STARPU_MAXCUDADEVS];
			
 
				-	unsigned th_per_stream = starpu_get_env_number_default("STARPU_CUDA_THREAD_PER_WORKER", 0);
			
 
				 #endif
			
 
				 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
			
 
				 	unsigned opencl_init[STARPU_MAXOPENCLDEVS] = { };
			
@@ -1814,7 +1813,7 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
				 				if (cuda_init[devid])
			
 
				 				{
			
 
				 					memory_node = cuda_memory_nodes[devid];
			
 
				-					if (th_per_stream == 0)
			
 
				+					if (config->topology.cuda_th_per_stream == 0)
			
 
				 						workerarg->bindid = cuda_bindid[devid];
			
 
				 					else
			
 
				 						workerarg->bindid = _starpu_get_next_bindid(config, preferred_binding, npreferred);
			
--- a/src/core/workers.h
+++ b/src/core/workers.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009-2016  Université de Bordeaux
			
 
				+ * Copyright (C) 2009-2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
			
 
				  * Copyright (C) 2011, 2016  INRIA
			
 
				  * Copyright (C) 2016  Uppsala University
			
@@ -243,6 +243,7 @@ struct _starpu_machine_topology
 
				 	/* Actual number of CUDA GPUs used by StarPU. */
			
 
				 	unsigned ncudagpus;
			
 
				 	unsigned nworkerpercuda;
			
 
				+	unsigned cuda_th_per_stream;
			
 
				 
			
 
				 	/* Actual number of OpenCL workers used by StarPU. */
			
 
				 	unsigned nopenclgpus;
			
--- a/src/drivers/cuda/starpu_cublas.c
+++ b/src/drivers/cuda/starpu_cublas.c
@@ -18,32 +18,52 @@
 
				 #include <starpu.h>
			
 
				 #include <starpu_cuda.h>
			
 
				 #include <common/config.h>
			
 
				+#include <core/workers.h>
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 #include <cublas.h>
			
 
				 
			
 
				+static int cublas_initialized[STARPU_NMAXWORKERS];
			
 
				+
			
 
				+static unsigned get_idx(void) {
			
 
				+	unsigned workerid = starpu_worker_get_id_check();
			
 
				+	unsigned th_per_stream = _starpu_get_machine_config()->topology.cuda_th_per_stream;
			
 
				+
			
 
				+	if (th_per_stream)
			
 
				+		return workerid;
			
 
				+	else
			
 
				+		return starpu_worker_get_devid(workerid);
			
 
				+}
			
 
				+
			
 
				 static void init_cublas_func(void *args STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				-	cublasStatus cublasst = cublasInit();
			
 
				-	if (STARPU_UNLIKELY(cublasst))
			
 
				-		STARPU_CUBLAS_REPORT_ERROR(cublasst);
			
 
				+	unsigned idx = get_idx();
			
 
				+	if (STARPU_ATOMIC_ADD(&cublas_initialized[idx], 1) == 1)
			
 
				+	{
			
 
				+		cublasStatus cublasst = cublasInit();
			
 
				+		if (STARPU_UNLIKELY(cublasst))
			
 
				+			STARPU_CUBLAS_REPORT_ERROR(cublasst);
			
 
				+	}
			
 
				+}
			
 
				 
			
 
				+static void set_cublas_stream_func(void *args STARPU_ATTRIBUTE_UNUSED)
			
 
				+{
			
 
				 	cublasSetKernelStream(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 
			
 
				 static void shutdown_cublas_func(void *args STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				-	cublasShutdown();
			
 
				+	unsigned idx = get_idx();
			
 
				+	if (STARPU_ATOMIC_ADD(&cublas_initialized[idx], -1) == 0)
			
 
				+		cublasShutdown();
			
 
				 }
			
 
				 #endif
			
 
				 
			
 
				-#ifdef STARPU_DEVEL
			
 
				-#warning FIXME should actually be done once per driver thread only, otherwise shutdown crashes
			
 
				-#endif
			
 
				 void starpu_cublas_init(void)
			
 
				 {
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	starpu_execute_on_each_worker(init_cublas_func, NULL, STARPU_CUDA);
			
 
				+	starpu_execute_on_each_worker(set_cublas_stream_func, NULL, STARPU_CUDA);
			
 
				 #endif
			
 
				 }