13 lat temu · 438839d828
--- a/ChangeLog
+++ b/ChangeLog
@@ -28,6 +28,7 @@ New features:
 
				     starpu_data_invalidate_submit
			
 
				   * New functionnality to wrapper starpu_insert_task to pass a array of
			
 
				 	data_handles via the parameter STARPU_DATA_ARRAY
			
 
				+  * Enable GPU-GPU direct transfers.
			
 
				 
			
 
				 Changes:
			
 
				   * The FxT code can now be used on systems other than Linux.
			
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -86,6 +86,41 @@ static hwloc_topology_t hwtopology;
 
				 #endif
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				+
			
 
				+static void initialize_cuda(unsigned ncuda)
			
 
				+{
			
 
				+	unsigned dev;
			
 
				+
			
 
				+	for (dev = 0; dev < ncuda; dev++) {
			
 
				+		cudaError_t cures;
			
 
				+
			
 
				+		_STARPU_DISP("Initializing CUDA%d...\n", dev);
			
 
				+
			
 
				+		/* Initiliaze CUDA context on the device */
			
 
				+		cures = cudaSetDevice(dev);
			
 
				+		if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+
			
 
				+#ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				+		unsigned dev2;
			
 
				+		for (dev2 = 0; dev2 < ncuda; dev2++) {
			
 
				+			if (dev2 != dev) {
			
 
				+				int can;
			
 
				+				cures = cudaDeviceCanAccessPeer(&can, dev, dev2);
			
 
				+				if (!cures && can) {
			
 
				+					cures = cudaDeviceEnablePeerAccess(dev2, 0);
			
 
				+					if (!cures)
			
 
				+						_STARPU_DISP("GPU-Direct %d -> %d\n", dev2, dev);
			
 
				+				}
			
 
				+			}
			
 
				+		}
			
 
				+#endif
			
 
				+
			
 
				+		/* hack to force the initialization */
			
 
				+		cures = cudaFree(0);
			
 
				+		if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				 static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int cpu, struct dev_timing *dev_timing_per_cpu)
			
 
				 {
			
 
				 	struct _starpu_machine_config *config = _starpu_get_machine_config();
			
@@ -98,12 +133,6 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 
				 	/* hack to avoid third party libs to rebind threads */
			
 
				 	_starpu_bind_thread_on_cpu(config, cpu);
			
 
				 
			
 
				-	/* hack to force the initialization */
			
 
				-	cudaFree(0);
			
 
				-
			
 
				-	/* hack to avoid third party libs to rebind threads */
			
 
				-	_starpu_bind_thread_on_cpu(config, cpu);
			
 
				-
			
 
				         /* Get the maximum size which can be allocated on the device */
			
 
				 	struct cudaDeviceProp prop;
			
 
				 	cudaError_t cures;
			
@@ -537,12 +566,7 @@ static void benchmark_all_gpu_devices(void)
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	ncuda = _starpu_get_cuda_device_count();
			
 
				-	for (i = 0; i < ncuda; i++)
			
 
				-	{
			
 
				-		_STARPU_DISP("CUDA %d...\n", i);
			
 
				-		/* measure bandwidth between Host and Device i */
			
 
				-		measure_bandwidth_between_host_and_dev(i, cudadev_timing_htod, cudadev_timing_dtoh, cudadev_timing_per_cpu, "CUDA");
			
 
				-	}
			
 
				+	initialize_cuda(ncuda);
			
 
				 #ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				 	for (i = 0; i < ncuda; i++)
			
 
				 		for (j = 0; j < ncuda; j++)
			
@@ -553,6 +577,12 @@ static void benchmark_all_gpu_devices(void)
 
				 				measure_bandwidth_between_dev_and_dev_cuda(i, j);
			
 
				 			}
			
 
				 #endif
			
 
				+	for (i = 0; i < ncuda; i++)
			
 
				+	{
			
 
				+		_STARPU_DISP("CUDA %d...\n", i);
			
 
				+		/* measure bandwidth between Host and Device i */
			
 
				+		measure_bandwidth_between_host_and_dev(i, cudadev_timing_htod, cudadev_timing_dtoh, cudadev_timing_per_cpu, "CUDA");
			
 
				+	}
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				         nopencl = _starpu_opencl_get_device_count();
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -138,10 +138,26 @@ done:
 
				 static void init_context(int devid)
			
 
				 {
			
 
				 	cudaError_t cures;
			
 
				-	int workerid = starpu_worker_get_id();
			
 
				+	int workerid;
			
 
				 
			
 
				 	starpu_cuda_set_device(devid);
			
 
				 
			
 
				+#ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				+	int nworkers = starpu_worker_get_count();
			
 
				+	for (workerid = 0; workerid < nworkers; workerid++) {
			
 
				+		struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
			
 
				+		if (worker->arch == STARPU_CUDA_WORKER && worker->devid != devid) {
			
 
				+			int can;
			
 
				+			cures = cudaDeviceCanAccessPeer(&can, devid, worker->devid);
			
 
				+			if (!cures && can) {
			
 
				+				cures = cudaDeviceEnablePeerAccess(worker->devid, 0);
			
 
				+				if (cures)
			
 
				+					_STARPU_DEBUG("GPU-Direct %d -> %d\n", worker->devid, devid);
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+#endif
			
 
				+
			
 
				 	/* force CUDA to initialize the context for real */
			
 
				 	cures = cudaFree(0);
			
 
				 	if (STARPU_UNLIKELY(cures)) {
			
@@ -164,6 +180,8 @@ static void init_context(int devid)
 
				 
			
 
				 	limit_gpu_mem_if_needed(devid);
			
 
				 
			
 
				+	workerid = starpu_worker_get_id();
			
 
				+
			
 
				 	cures = cudaStreamCreate(&streams[workerid]);
			
 
				 	if (STARPU_UNLIKELY(cures))
			
 
				 		STARPU_CUDA_REPORT_ERROR(cures);