浏览代码

Rework block and tensor OpenCL kernels

The AMD implementation doesn't seem to be coping well with so many for
loops...
Samuel Thibault 4 年之前
父节点
当前提交
485327669e

+ 2 - 2
examples/filters/fblock_opencl.c

@@ -60,8 +60,8 @@ void opencl_func(void *buffers[], void *cl_arg)
 	CHECK_CL_SET_KERNEL_ARG(kernel, 7, sizeof(*factor), factor);
 
 	{
-		size_t global=nx*ny*nz;
-		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
+		size_t global[3]={nx,ny,nz};
+		err = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global, NULL, 0, NULL, NULL);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 	}
 	starpu_opencl_release_kernel(kernel);

+ 13 - 10
examples/filters/fblock_opencl_kernel.cl

@@ -18,14 +18,17 @@
 
 __kernel void fblock_opencl(__global int* block, unsigned offset, int nx, int ny, int nz, unsigned ldy, unsigned ldz, int factor)
 {
-        int i, j, k;
-        block = (__global int*) ((__global char *)block + offset);
-        for(k=0; k<nz ; k++)
-	{
-                for(j=0; j<ny ; j++)
-		{
-                        for(i=0; i<nx ; i++)
-                                block[(k*ldz)+(j*ldy)+i] = factor;
-                }
-        }
+	const int idx = get_global_id(0);
+	const int idy = get_global_id(1);
+	const int idz = get_global_id(2);
+	if (idx >= nx)
+		return;
+	if (idy >= ny)
+		return;
+	if (idz >= nz)
+		return;
+
+	block = (__global int*) ((__global char *)block + offset);
+	int i = idz*ldz + idy*ldy + idx;
+	block[i] = factor;
 }

+ 3 - 3
tests/datawizard/interfaces/block/block_opencl.c

@@ -83,12 +83,12 @@ test_block_opencl_func(void *buffers[], void *args)
 	}
 			
 	{
-		size_t global = nx * ny * nz;
+		size_t global[3] = {nx, ny, nz};
 		err = clEnqueueNDRangeKernel(queue,
 					     kernel,
-					     1,
+					     3,
 					     NULL,
-					     &global,
+					     global,
 					     NULL,
 					     0,
 					     NULL,

+ 15 - 23
tests/datawizard/interfaces/block/block_opencl_kernel.cl

@@ -18,29 +18,21 @@ __kernel void block_opencl(__global int *block,
 			   int ldy, int ldz,
 			   int factor, __global int *err)
 {
-        const int id = get_global_id(0);
-	if (id > 0)
+	const int idx = get_global_id(0);
+	const int idy = get_global_id(1);
+	const int idz = get_global_id(2);
+	if (idx >= nx)
 		return;
+	if (idy >= ny)
+		return;
+	if (idz >= nz)
+		return;
+
+	int val = idz*ny*nx+idy*nx+idx;
+	int i = (idz*ldz)+(idy*ldy)+idx;
 
-	unsigned int i, j, k;
-	int val = 0;
-	for (k = 0; k < nz; k++)
-	{
-		for (j = 0; j < ny; j++)
-		{
-			for (i = 0; i < nx; i++)
-			{
-                                if (block[(k*ldz)+(j*ldy)+i] != factor * val)
-				{
-					*err = 1;
-					return;
-				}
-				else
-				{
-					block[(k*ldz)+(j*ldy)+i] *= -1;
-					val++;
-				}
-			}
-		}
-	}
+	if (block[i] != factor * val)
+		*err = 1;
+	else
+		block[i] *= -1;
 }

+ 1 - 1
tests/datawizard/interfaces/tensor/tensor_interface.c

@@ -18,7 +18,7 @@
 #include "../test_interfaces.h"
 #include "../../../helper.h"
 
-#define NX 16
+#define NX 4
 #define NY NX
 #define NZ NX
 #define NT NX

+ 3 - 3
tests/datawizard/interfaces/tensor/tensor_opencl.c

@@ -87,12 +87,12 @@ test_tensor_opencl_func(void *buffers[], void *args)
 	}
 			
 	{
-                size_t global = 1;
+		size_t global[3] = {nx, ny, nz*nt};
 		err = clEnqueueNDRangeKernel(queue,
 					     kernel,
-					     1,
+					     3,
 					     NULL,
-					     &global,
+					     global,
 					     NULL,
 					     0,
 					     NULL,

+ 18 - 26
tests/datawizard/interfaces/tensor/tensor_opencl_kernel.cl

@@ -18,32 +18,24 @@ __kernel void tensor_opencl(__global int *tensor,
 			   int ldy, int ldz, int ldt,
 			   int factor, __global int *err)
 {
-        const int id = get_global_id(0);
-	if (id > 0)
+	const int idx = get_global_id(0);
+	const int idy = get_global_id(1);
+	const int idz = get_global_id(2) % nz;
+	const int idt = get_global_id(2) / nz;
+	if (idx >= nx)
 		return;
+	if (idy >= ny)
+		return;
+	if (idz >= nz)
+		return;
+	if (idt >= nt)
+		return;
+
+	int val = idt*nz*ny*nx+idz*ny*nx+idy*nx+idx;
+	int i = (idt*ldt)+(idz*ldz)+(idy*ldy)+idx;
 
-	unsigned int i, j, k, l;
-	int val = 0;
-	for (l = 0; l < nt; l++)
-	{
-	    for (k = 0; k < nz; k++)
-	    {
-		for (j = 0; j < ny; j++)
-		{
-			for (i = 0; i < nx; i++)
-			{
-                                if (tensor[(l*ldt)+(k*ldz)+(j*ldy)+i] != factor * val)
-				{
-					*err = 1;
-					return;
-				}
-				else
-				{
-					tensor[(l*ldt)+(k*ldz)+(j*ldy)+i] *= -1;
-					val++;
-				}
-			}
-		}
-	    }
-	}
+	if (tensor[i] != factor * val)
+		*err = 1;
+	else
+		tensor[i] *= -1;
 }