Browse Source

Implement STARPU_LIMIT_GPU_MEM on both CUDA and OpenCL devices.

Cédric Augonnet 14 years ago
parent
commit
e3df8c20c8
2 changed files with 63 additions and 12 deletions
  1. 16 12
      src/drivers/cuda/driver_cuda.c
  2. 47 0
      src/drivers/opencl/driver_opencl.c

+ 16 - 12
src/drivers/cuda/driver_cuda.c

@@ -38,12 +38,11 @@ static char *wasted_memory[STARPU_NMAXWORKERS];
 static void limit_gpu_mem_if_needed(int devid)
 {
 	cudaError_t cures;
-	int workerid = starpu_worker_get_id();
 	int limit = starpu_get_env_number("STARPU_LIMIT_GPU_MEM");
 
 	if (limit == -1)
 	{
-		wasted_memory[workerid] = NULL;
+		wasted_memory[devid] = NULL;
 		return;
 	}
 
@@ -54,26 +53,31 @@ static void limit_gpu_mem_if_needed(int devid)
 		STARPU_CUDA_REPORT_ERROR(cures);
 
 	size_t totalGlobalMem = prop.totalGlobalMem;
-	size_t to_waste = totalGlobalMem - (size_t)limit*1024*1024;
 
-	_STARPU_DEBUG("Wasting %ld MB / Limit %ld MB / Total %ld MB / Remains %ld MB\n", (size_t)to_waste/(1024*1024), (size_t)limit, (size_t)totalGlobalMem/(1024*1024), (size_t)(totalGlobalMem - to_waste)/(1024*1024));
+	/* How much memory to waste ? */
+	size_t to_waste = totalGlobalMem - (size_t)limit*1024*1024;
 
-	cures = cudaMalloc((void **)&wasted_memory[workerid], to_waste);
+	_STARPU_DEBUG("CUDA device %d: Wasting %ld MB / Limit %ld MB / Total %ld MB / Remains %ld MB\n",
+			devid, (size_t)to_waste/(1024*1024), (size_t)limit, (size_t)totalGlobalMem/(1024*1024),
+			(size_t)(totalGlobalMem - to_waste)/(1024*1024));
+	
+	/* Allocate a large buffer to waste memory and constraint the amount of available memory. */
+	cures = cudaMalloc((void **)&wasted_memory[devid], to_waste);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 }
 
-static void unlimit_gpu_mem_if_needed(int workerid)
+static void unlimit_gpu_mem_if_needed(int devid)
 {
 	cudaError_t cures;
 
-	if (wasted_memory[workerid])
+	if (wasted_memory[devid])
 	{
-		cures = cudaFree(wasted_memory[workerid]);
+		cures = cudaFree(wasted_memory[devid]);
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
 
-		wasted_memory[workerid] = NULL;
+		wasted_memory[devid] = NULL;
 	}
 }
 
@@ -102,13 +106,13 @@ static void init_context(int devid)
 		STARPU_CUDA_REPORT_ERROR(cures);
 }
 
-static void deinit_context(int workerid)
+static void deinit_context(int workerid, int devid)
 {
 	cudaError_t cures;
 
 	cudaStreamDestroy(streams[workerid]);
 
-	unlimit_gpu_mem_if_needed(workerid);
+	unlimit_gpu_mem_if_needed(devid);
 
 	/* cleanup the runtime API internal stuffs (which CUBLAS is using) */
 	cures = cudaThreadExit();
@@ -311,7 +315,7 @@ void *_starpu_cuda_worker(void *arg)
 	 * coherency is not maintained anymore at that point ! */
 	_starpu_free_all_automatically_allocated_buffers(memnode);
 
-	deinit_context(args->workerid);
+	deinit_context(args->workerid, args->devid);
 
 	STARPU_TRACE_WORKER_DEINIT_END(STARPU_FUT_CUDA_KEY);
 

+ 47 - 0
src/drivers/opencl/driver_opencl.c

@@ -37,6 +37,49 @@ static cl_uint nb_devices = -1;
 static int init_done = 0;
 extern char *_starpu_opencl_program_dir;
 
+/* In case we want to cap the amount of memory available on the GPUs by the
+ * mean of the STARPU_LIMIT_GPU_MEM, we allocate a big buffer when the driver
+ * is launched. */
+static cl_mem wasted_memory[STARPU_MAXOPENCLDEVS];
+
+static void limit_gpu_mem_if_needed(int devid)
+{
+	cl_int err;
+
+	int limit = starpu_get_env_number("STARPU_LIMIT_GPU_MEM");
+	if (limit == -1)
+	{
+		wasted_memory[devid] = NULL;
+		return;
+	}
+
+	/* Request the size of the current device's memory */
+	cl_ulong totalGlobalMem;
+	clGetDeviceInfo(devices[devid], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(totalGlobalMem), &totalGlobalMem, NULL);
+
+	/* How much memory to waste ? */
+	size_t to_waste = (size_t)totalGlobalMem - (size_t)limit*1024*1024;
+
+	_STARPU_DEBUG("OpenCL device %d: Wasting %ld MB / Limit %ld MB / Total %ld MB / Remains %ld MB\n",
+			devid, (size_t)to_waste/(1024*1024), (size_t)limit, (size_t)totalGlobalMem/(1024*1024),
+			(size_t)(totalGlobalMem - to_waste)/(1024*1024));
+
+	/* Allocate a large buffer to waste memory and constraint the amount of available memory. */
+	wasted_memory[devid] = clCreateBuffer(contexts[devid], CL_MEM_READ_WRITE, to_waste, NULL, &err);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+}
+
+static void unlimit_gpu_mem_if_needed(int devid)
+{
+	if (wasted_memory[devid])
+	{
+		clReleaseMemObject(wasted_memory[devid]);
+		wasted_memory[devid] = NULL;
+	}
+}
+
+
 void starpu_opencl_get_context(int devid, cl_context *context)
 {
         *context = contexts[devid];
@@ -69,6 +112,8 @@ int _starpu_opencl_init_context(int devid)
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 	PTHREAD_MUTEX_UNLOCK(&big_lock);
 
+	limit_gpu_mem_if_needed(devid);
+
 	return EXIT_SUCCESS;
 }
 
@@ -80,6 +125,8 @@ int _starpu_opencl_deinit_context(int devid)
 
         _STARPU_DEBUG("De-initialising context for dev %d\n", devid);
 
+	unlimit_gpu_mem_if_needed(devid);
+
         err = clReleaseContext(contexts[devid]);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);