Pārlūkot izejas kodu

Enable GPU-GPU direct transfers

Samuel Thibault 12 gadi atpakaļ
vecāks
revīzija
438839d828
3 mainītis faili ar 62 papildinājumiem un 13 dzēšanām
  1. 1 0
      ChangeLog
  2. 42 12
      src/core/perfmodel/perfmodel_bus.c
  3. 19 1
      src/drivers/cuda/driver_cuda.c

+ 1 - 0
ChangeLog

@@ -28,6 +28,7 @@ New features:
     starpu_data_invalidate_submit
   * New functionnality to wrapper starpu_insert_task to pass a array of
 	data_handles via the parameter STARPU_DATA_ARRAY
+  * Enable GPU-GPU direct transfers.
 
 Changes:
   * The FxT code can now be used on systems other than Linux.

+ 42 - 12
src/core/perfmodel/perfmodel_bus.c

@@ -86,6 +86,41 @@ static hwloc_topology_t hwtopology;
 #endif
 
 #ifdef STARPU_USE_CUDA
+
+static void initialize_cuda(unsigned ncuda)
+{
+	unsigned dev;
+
+	for (dev = 0; dev < ncuda; dev++) {
+		cudaError_t cures;
+
+		_STARPU_DISP("Initializing CUDA%d...\n", dev);
+
+		/* Initiliaze CUDA context on the device */
+		cures = cudaSetDevice(dev);
+		if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
+
+#ifdef HAVE_CUDA_MEMCPY_PEER
+		unsigned dev2;
+		for (dev2 = 0; dev2 < ncuda; dev2++) {
+			if (dev2 != dev) {
+				int can;
+				cures = cudaDeviceCanAccessPeer(&can, dev, dev2);
+				if (!cures && can) {
+					cures = cudaDeviceEnablePeerAccess(dev2, 0);
+					if (!cures)
+						_STARPU_DISP("GPU-Direct %d -> %d\n", dev2, dev);
+				}
+			}
+		}
+#endif
+
+		/* hack to force the initialization */
+		cures = cudaFree(0);
+		if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
+	}
+}
+
 static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int cpu, struct dev_timing *dev_timing_per_cpu)
 {
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
@@ -98,12 +133,6 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(config, cpu);
 
-	/* hack to force the initialization */
-	cudaFree(0);
-
-	/* hack to avoid third party libs to rebind threads */
-	_starpu_bind_thread_on_cpu(config, cpu);
-
         /* Get the maximum size which can be allocated on the device */
 	struct cudaDeviceProp prop;
 	cudaError_t cures;
@@ -537,12 +566,7 @@ static void benchmark_all_gpu_devices(void)
 
 #ifdef STARPU_USE_CUDA
 	ncuda = _starpu_get_cuda_device_count();
-	for (i = 0; i < ncuda; i++)
-	{
-		_STARPU_DISP("CUDA %d...\n", i);
-		/* measure bandwidth between Host and Device i */
-		measure_bandwidth_between_host_and_dev(i, cudadev_timing_htod, cudadev_timing_dtoh, cudadev_timing_per_cpu, "CUDA");
-	}
+	initialize_cuda(ncuda);
 #ifdef HAVE_CUDA_MEMCPY_PEER
 	for (i = 0; i < ncuda; i++)
 		for (j = 0; j < ncuda; j++)
@@ -553,6 +577,12 @@ static void benchmark_all_gpu_devices(void)
 				measure_bandwidth_between_dev_and_dev_cuda(i, j);
 			}
 #endif
+	for (i = 0; i < ncuda; i++)
+	{
+		_STARPU_DISP("CUDA %d...\n", i);
+		/* measure bandwidth between Host and Device i */
+		measure_bandwidth_between_host_and_dev(i, cudadev_timing_htod, cudadev_timing_dtoh, cudadev_timing_per_cpu, "CUDA");
+	}
 #endif
 #ifdef STARPU_USE_OPENCL
         nopencl = _starpu_opencl_get_device_count();

+ 19 - 1
src/drivers/cuda/driver_cuda.c

@@ -138,10 +138,26 @@ done:
 static void init_context(int devid)
 {
 	cudaError_t cures;
-	int workerid = starpu_worker_get_id();
+	int workerid;
 
 	starpu_cuda_set_device(devid);
 
+#ifdef HAVE_CUDA_MEMCPY_PEER
+	int nworkers = starpu_worker_get_count();
+	for (workerid = 0; workerid < nworkers; workerid++) {
+		struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+		if (worker->arch == STARPU_CUDA_WORKER && worker->devid != devid) {
+			int can;
+			cures = cudaDeviceCanAccessPeer(&can, devid, worker->devid);
+			if (!cures && can) {
+				cures = cudaDeviceEnablePeerAccess(worker->devid, 0);
+				if (cures)
+					_STARPU_DEBUG("GPU-Direct %d -> %d\n", worker->devid, devid);
+			}
+		}
+	}
+#endif
+
 	/* force CUDA to initialize the context for real */
 	cures = cudaFree(0);
 	if (STARPU_UNLIKELY(cures)) {
@@ -164,6 +180,8 @@ static void init_context(int devid)
 
 	limit_gpu_mem_if_needed(devid);
 
+	workerid = starpu_worker_get_id();
+
 	cures = cudaStreamCreate(&streams[workerid]);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);