Browse Source

Fix OpenCL kernel submission with sizes that are not multiple of the local size

Samuel Thibault 4 years ago
parent
commit
89e6306374

+ 1 - 0
doc/doxygen/chapters/code/vector_scal_opencl.c

@@ -57,6 +57,7 @@ void scal_opencl_func(void *buffers[], void *_args)
         err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
         if (local > global) local=global;
+        else global = (global + local-1) / local * local;
 
         err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

+ 2 - 0
examples/axpy/axpy_opencl.c

@@ -60,6 +60,8 @@ void axpy_opencl(void *buffers[], void *_args)
 			STARPU_OPENCL_REPORT_ERROR(err);
                 if (local > global)
 			local=global;
+                else
+                        global = (global + local-1) / local * local;
 
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		if (err != CL_SUCCESS)

+ 2 - 0
examples/basic_examples/multiformat_conversion_codelets_opencl.c

@@ -74,6 +74,8 @@ void cpu_to_opencl_opencl_func(void *buffers[], void *args)
 
                 if (local > global)
 			local = global;
+                else
+                        global = (global + local-1) / local * local;
 
 		err = clEnqueueNDRangeKernel(queue,
 					kernel,

+ 2 - 0
examples/basic_examples/multiformat_opencl.c

@@ -68,6 +68,8 @@ void multiformat_scal_opencl_func(void *buffers[], void *args)
 
                 if (local > global)
 			local = global;
+                else
+                        global = (global + local-1) / local * local;
 
 		err = clEnqueueNDRangeKernel(queue,
 					kernel,

+ 1 - 0
examples/basic_examples/vector_scal_opencl.c

@@ -57,6 +57,7 @@ void scal_opencl_func(void *buffers[], void *_args)
                 err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
                 if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
                 if (local > global) local=global;
+                else global = (global + local-1) / local * local;
 
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

+ 2 - 0
examples/filters/custom_mf/conversion_opencl.c

@@ -76,6 +76,8 @@ void cpu_to_opencl_opencl_func(void *buffers[], void *args)
 
                 if (local > global)
 			local = global;
+                else
+                        global = (global + local-1) / local * local;
 
 		err = clEnqueueNDRangeKernel(
 				queue,

+ 2 - 0
examples/filters/custom_mf/custom_opencl.c

@@ -75,6 +75,8 @@ void custom_scal_opencl_func(void *buffers[], void *args)
 
                 if (local > global)
 			local = global;
+                else
+                        global = (global + local-1) / local * local;
 
 		err = clEnqueueNDRangeKernel(
 				queue,

+ 2 - 0
examples/interface/complex_kernels_opencl.c

@@ -64,6 +64,8 @@ void copy_complex_codelet_opencl(void *buffers[], void *_args)
 			STARPU_OPENCL_REPORT_ERROR(err);
                 if (local > global)
 			local=global;
+                else
+                        global = (global + local-1) / local * local;
 
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		if (err != CL_SUCCESS)

+ 2 - 14
examples/reductions/dot_product.c

@@ -185,18 +185,12 @@ void redux_opencl_func(void *buffers[], void *args)
 
 	{
 		size_t global=1;
-		size_t local;
+                size_t local=1;
                 size_t s;
                 cl_device_id device;
 
                 starpu_opencl_get_device(devid, &device);
 
-                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
-                if (err != CL_SUCCESS)
-			STARPU_OPENCL_REPORT_ERROR(err);
-                if (local > global)
-			local=global;
-
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		if (err != CL_SUCCESS)
 			STARPU_OPENCL_REPORT_ERROR(err);
@@ -306,18 +300,12 @@ void dot_opencl_func(void *buffers[], void *cl_arg)
 
 	{
 		size_t global=1;
-		size_t local;
+                size_t local=1;
                 size_t s;
                 cl_device_id device;
 
                 starpu_opencl_get_device(devid, &device);
 
-                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
-                if (err != CL_SUCCESS)
-			STARPU_OPENCL_REPORT_ERROR(err);
-                if (local > global)
-			local=global;
-
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		if (err != CL_SUCCESS)
 			STARPU_OPENCL_REPORT_ERROR(err);

+ 1 - 0
examples/reductions/dot_product_opencl_kernels.cl

@@ -31,6 +31,7 @@ __kernel void _dot_opencl(__global float *x,
 			  __global DOT_TYPE *dot,
 			  unsigned n)
 {
+/* FIXME: real parallel implementation */
 	unsigned i;
 	__local double tmp;
 	tmp = 0.0;

+ 2 - 0
tests/datawizard/interfaces/bcsr/bcsr_opencl.c

@@ -95,6 +95,8 @@ test_bcsr_opencl_func(void *buffers[], void *args)
 
                 if (local > global)
 			local = global;
+                else
+                        global = (global + local-1) / local * local;
 
 		err = clEnqueueNDRangeKernel(queue,
 					kernel,

+ 2 - 0
tests/datawizard/interfaces/coo/coo_opencl.c

@@ -93,6 +93,8 @@ test_coo_opencl_func(void *buffers[], void *args)
 
                 if (local > global)
 			local = global;
+                else
+                        global = (global + local-1) / local * local;
 
 		err = clEnqueueNDRangeKernel(queue,
 					kernel,

+ 2 - 0
tests/datawizard/interfaces/csr/csr_opencl.c

@@ -93,6 +93,8 @@ test_csr_opencl_func(void *buffers[], void *args)
 
                 if (local > global)
 			local = global;
+                else
+                        global = (global + local-1) / local * local;
 
 		err = clEnqueueNDRangeKernel(queue,
 					kernel,

+ 2 - 0
tests/datawizard/interfaces/matrix/matrix_opencl.c

@@ -92,6 +92,8 @@ void test_matrix_opencl_func(void *buffers[], void *args)
 
                 if (local > global)
 			local = global;
+                else
+                        global = (global + local-1) / local * local;
 
 		err = clEnqueueNDRangeKernel(queue,
 					kernel,

+ 2 - 0
tests/datawizard/interfaces/multiformat/multiformat_conversion_codelets_opencl.c

@@ -84,6 +84,8 @@ void cpu_to_opencl_opencl_func(void *buffers[], void *args)
 
                 if (local > global)
 			local = global;
+                else
+                        global = (global + local-1) / local * local;
 
 		err = clEnqueueNDRangeKernel(queue,
 					kernel,

+ 2 - 0
tests/datawizard/interfaces/multiformat/multiformat_opencl.c

@@ -98,6 +98,8 @@ void test_multiformat_opencl_func(void *buffers[], void *args)
 
                 if (local > global)
 			local = global;
+                else
+                        global = (global + local-1) / local * local;
 
 		err = clEnqueueNDRangeKernel(queue,
 					kernel,

+ 1 - 1
tests/datawizard/interfaces/tensor/tensor_opencl.c

@@ -87,7 +87,7 @@ test_tensor_opencl_func(void *buffers[], void *args)
 	}
 			
 	{
-		size_t global = nx * ny * nz * nt;
+                size_t global = 1;
 		err = clEnqueueNDRangeKernel(queue,
 					     kernel,
 					     1,

+ 1 - 13
tests/datawizard/interfaces/variable/variable_opencl.c

@@ -73,24 +73,12 @@ void test_variable_opencl_func(void *buffers[], void *args)
 
 	{
 		size_t global = 1;
-		size_t local;
+                size_t local = 1;
                 size_t s;
                 cl_device_id device;
 
                 starpu_opencl_get_device(devid, &device);
 
-                err = clGetKernelWorkGroupInfo (kernel,
-						device,
-						CL_KERNEL_WORK_GROUP_SIZE,
-						sizeof(local),
-						&local,
-						&s);
-                if (err != CL_SUCCESS)
-			STARPU_OPENCL_REPORT_ERROR(err);
-
-                if (local > global)
-			local = global;
-
 		err = clEnqueueNDRangeKernel(queue,
 					kernel,
 					1,

+ 2 - 0
tests/datawizard/interfaces/vector/vector_opencl.c

@@ -91,6 +91,8 @@ test_vector_opencl_func(void *buffers[], void *args)
 
                 if (local > global)
 			local = global;
+                else
+                        global = (global + local-1) / local * local;
 
 		err = clEnqueueNDRangeKernel(queue,
 					kernel,

+ 1 - 0
tests/datawizard/scal.c

@@ -73,6 +73,7 @@ void scal_func_opencl(void *buffers[], void *cl_arg)
                 err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
                 if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
                 if (local > global) local=global;
+                else global = (global + local-1) / local * local;
 
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

+ 2 - 0
tests/datawizard/scratch_opencl.c

@@ -73,6 +73,8 @@ void opencl_f(void *buffers[], void *args)
 			STARPU_OPENCL_REPORT_ERROR(err);
                 if (local > global)
 			local=global;
+                else
+                        global = (global + local-1) / local * local;
 
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		if (err != CL_SUCCESS)

+ 2 - 0
tests/perfmodels/opencl_memset.c

@@ -58,6 +58,8 @@ void memset_opencl(void *buffers[], void *args)
 			STARPU_OPENCL_REPORT_ERROR(err);
                 if (local > global)
 			local=global;
+                else
+                        global = (global + local-1) / local * local;
 
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		if (err != CL_SUCCESS)