Sfoglia il codice sorgente

core/perfmodel/perfmodel_bus.c: check maximum size available on CUDA device

Nathalie Furmento 14 anni fa
parent
commit
23793fca80
1 file modificato con 15 aggiunte e 7 eliminazioni
  1. 15 7
      src/core/perfmodel/perfmodel_bus.c

+ 15 - 7
src/core/perfmodel/perfmodel_bus.c

@@ -27,6 +27,7 @@
 #include <math.h>
 
 #include <starpu.h>
+#include <starpu_cuda.h>
 #include <starpu_opencl.h>
 #include <common/config.h>
 #include <core/workers.h>
@@ -65,6 +66,7 @@ static int cuda_affinity_matrix[STARPU_MAXCUDADEVS][MAXCPUS];
 static double cudadev_timing_htod[STARPU_MAXNODES] = {0.0};
 static double cudadev_timing_dtoh[STARPU_MAXNODES] = {0.0};
 static struct dev_timing cudadev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
+static size_t cuda_size = SIZE;
 #endif
 #ifdef STARPU_USE_OPENCL
 static int opencl_affinity_matrix[STARPU_MAXOPENCLDEVS][MAXCPUS];
@@ -98,10 +100,16 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(config, cpu);
 
+        /* Clamp the transfer size to the total global memory of the device */
+	struct cudaDeviceProp prop;
+	cudaError_t cures;
+	cures = cudaGetDeviceProperties(&prop, dev);
+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
+        if (cuda_size > prop.totalGlobalMem) cuda_size = prop.totalGlobalMem;
 
 	/* Allocate a buffer on the device */
 	unsigned char *d_buffer;
-	cudaMalloc((void **)&d_buffer, SIZE);
+	cudaMalloc((void **)&d_buffer, cuda_size);
 	assert(d_buffer);
 
 	/* hack to avoid third party libs to rebind threads */
@@ -110,7 +118,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 
 	/* Allocate a buffer on the host */
 	unsigned char *h_buffer;
-	cudaHostAlloc((void **)&h_buffer, SIZE, 0);
+	cudaHostAlloc((void **)&h_buffer, cuda_size, 0);
 	assert(h_buffer);
 
 	/* hack to avoid third party libs to rebind threads */
@@ -118,8 +126,8 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 
 
 	/* Fill them */
-	memset(h_buffer, 0, SIZE);
-	cudaMemset(d_buffer, 0, SIZE);
+	memset(h_buffer, 0, cuda_size);
+	cudaMemset(d_buffer, 0, cuda_size);
 
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(config, cpu);
@@ -134,7 +142,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 	gettimeofday(&start, NULL);
 	for (iter = 0; iter < NITER; iter++)
 	{
-		cudaMemcpy(d_buffer, h_buffer, SIZE, cudaMemcpyHostToDevice);
+		cudaMemcpy(d_buffer, h_buffer, cuda_size, cudaMemcpyHostToDevice);
 		cudaThreadSynchronize();
 	}
 	gettimeofday(&end, NULL);
@@ -146,7 +154,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 	gettimeofday(&start, NULL);
 	for (iter = 0; iter < NITER; iter++)
 	{
-		cudaMemcpy(h_buffer, d_buffer, SIZE, cudaMemcpyDeviceToHost);
+		cudaMemcpy(h_buffer, d_buffer, cuda_size, cudaMemcpyDeviceToHost);
 		cudaThreadSynchronize();
 	}
 	gettimeofday(&end, NULL);
@@ -882,7 +890,7 @@ static void write_bus_bandwidth_file_content(void)
 				time_src_to_ram = (src==0)?0.0:cudadev_timing_dtoh[src];
                                 time_ram_to_dst = (dst==0)?0.0:cudadev_timing_htod[dst];
 				timing =time_src_to_ram + time_ram_to_dst;
-				bandwidth = 1.0*SIZE/timing;
+				bandwidth = 1.0*cuda_size/timing;
 #endif
 #ifdef STARPU_USE_OPENCL
                                 if (src > ncuda)