Kaynağa Gözat

Make examples/reductions/dot_product work with OpenCL.

Cyril Roelandt 13 yıl önce
ebeveyn
işleme
4c003c74ec

+ 8 - 2
examples/Makefile.am

@@ -54,7 +54,8 @@ EXTRA_DIST = 					\
 	filters/fblock_opencl_kernel.cl		\
 	filters/custom_mf/conversion_opencl.cl  \
 	filters/custom_mf/custom_opencl.cl \
-	interface/complex_kernels.cl
+	interface/complex_kernels.cl \
+	reductions/dot_product_opencl_kernels.cl
 
 CLEANFILES = 					\
 	gordon/null_kernel_gordon.spuelf
@@ -137,7 +138,8 @@ noinst_HEADERS = 				\
 	pi/SobolQRNG/sobol.h			\
 	pi/SobolQRNG/sobol_gold.h		\
 	pi/SobolQRNG/sobol_gpu.h		\
-	pi/SobolQRNG/sobol_primitives.h
+	pi/SobolQRNG/sobol_primitives.h         \
+	reductions/dot_product.h
 
 #####################################
 # What to install and what to check #
@@ -753,6 +755,10 @@ if STARPU_USE_CUDA
 reductions_dot_product_SOURCES +=		\
 	reductions/dot_product_kernels.cu
 endif
+if STARPU_USE_OPENCL
+nobase_STARPU_OPENCL_DATA_DATA += \
+	reductions/dot_product_opencl_kernels.cl
+endif
 
 ##################
 # Mandelbrot Set #

+ 162 - 6
examples/reductions/dot_product.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2012 inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -17,31 +18,40 @@
 #include <starpu.h>
 #include <assert.h>
 
+#include <reductions/dot_product.h>
+
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cublas.h>
 #include <starpu_cuda.h>
 #endif
 
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#endif
+
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 
 static float *x;
 static float *y;
 static starpu_data_handle_t *x_handles;
 static starpu_data_handle_t *y_handles;
+#ifdef STARPU_USE_OPENCL
+static struct starpu_opencl_program opencl_program;
+#endif
 
 static unsigned nblocks = 4096;
 static unsigned entries_per_block = 1024;
 
-#define DOT_TYPE double
-
 static DOT_TYPE dot = 0.0f;
 static starpu_data_handle_t dot_handle;
 
 static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
 {
-	if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
+	enum starpu_archtype type = starpu_worker_get_type(workerid);
+	if (type == STARPU_CPU_WORKER || type == STARPU_OPENCL_WORKER)
 		return 1;
+
 #ifdef STARPU_USE_CUDA
 	/* Cuda device */
 	const struct cudaDeviceProp *props;
@@ -73,14 +83,41 @@ void init_cuda_func(void *descr[], void *cl_arg)
 }
 #endif
 
+#ifdef STARPU_USE_OPENCL
+void init_opencl_func(void *buffers[], void *args)
+{
+        cl_int err;
+	cl_command_queue queue;
+	/* OpenCL implementation registered in init_codelet: overwrites the variable in buffers[0] with zero. */
+	cl_mem dot = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[0]);	/* shadows the file-scope `dot` inside this function */
+	starpu_opencl_get_current_queue(&queue);
+	DOT_TYPE zero = (DOT_TYPE) 0.0;
+	/* CL_TRUE requests a blocking write, so the stack variable `zero` is not read after the call returns. */
+	err = clEnqueueWriteBuffer(queue,
+			dot,
+			CL_TRUE,
+			0,
+			sizeof(DOT_TYPE),
+			&zero,
+			0,
+			NULL,
+			NULL);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+}
+#endif
+
 static struct starpu_codelet init_codelet =
 {
-	.where = STARPU_CPU|STARPU_CUDA,
 	.can_execute = can_execute,
 	.cpu_funcs = {init_cpu_func, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {init_cuda_func, NULL},
 #endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {init_opencl_func, NULL},
+#endif
 	.nbuffers = 1
 };
 
@@ -100,14 +137,67 @@ void redux_cpu_func(void *descr[], void *cl_arg)
 extern void redux_cuda_func(void *descr[], void *_args);
 #endif
 
+#ifdef STARPU_USE_OPENCL
+void redux_opencl_func(void *buffers[], void *args)
+{
+	int id, devid;
+        cl_int err;
+	cl_kernel kernel;
+	cl_command_queue queue;
+	cl_event event;
+	/* OpenCL implementation registered in redux_codelet: runs the "_redux_opencl" kernel on buffers[0] and buffers[1]. */
+	cl_mem dota = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[0]);
+	cl_mem dotb = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[1]);
+
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+	/* Obtain `kernel` and `queue` from opencl_program for the device of the current worker. */
+	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "_redux_opencl", devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = clSetKernelArg(kernel, 0, sizeof(dota), &dota);
+	err|= clSetKernelArg(kernel, 1, sizeof(dotb), &dotb);	/* NOTE(review): OR-ing two cl_int codes can report a value matching neither failure */
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+		size_t global=1;	/* a single work-item is launched */
+		size_t local;
+                size_t s;	/* receives the returned parameter size; not read afterwards */
+                cl_device_id device;
+
+                starpu_opencl_get_device(devid, &device);
+		/* Ask for the kernel's work-group size on this device, then clamp it to the global size. */
+                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
+                if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+                if (local > global)
+			local=global;
+
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+	/* Drain the queue before passing the event to starpu_opencl_collect_stats() and releasing it. */
+	clFinish(queue);	/* NOTE(review): return value is not checked */
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+}
+#endif
+
 static struct starpu_codelet redux_codelet =
 {
-	.where = STARPU_CPU|STARPU_CUDA,
 	.can_execute = can_execute,
 	.cpu_funcs = {redux_cpu_func, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {redux_cuda_func, NULL},
 #endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {redux_opencl_func, NULL},
+#endif
 	.nbuffers = 2
 };
 
@@ -163,14 +253,71 @@ void dot_cuda_func(void *descr[], void *cl_arg)
 }
 #endif
 
+#ifdef STARPU_USE_OPENCL
+void dot_opencl_func(void *buffers[], void *args)
+{
+	int id, devid;
+        cl_int err;
+	cl_kernel kernel;
+	cl_command_queue queue;
+	cl_event event;
+	/* OpenCL implementation registered in dot_codelet: runs "_dot_opencl" on two vectors and the variable in buffers[2]. */
+	cl_mem x = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);	/* shadows the file-scope `x` */
+	cl_mem y = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[1]);	/* shadows the file-scope `y` */
+	cl_mem dot = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[2]);	/* shadows the file-scope `dot` */
+	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);	/* element count is taken from the first vector only */
+
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+	/* Obtain `kernel` and `queue` from opencl_program for the device of the current worker. */
+	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "_dot_opencl", devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+	/* Argument order matches the _dot_opencl signature: x, y, dot, n. */
+	err = clSetKernelArg(kernel, 0, sizeof(x), &x);
+	err|= clSetKernelArg(kernel, 1, sizeof(y), &y);
+	err|= clSetKernelArg(kernel, 2, sizeof(dot), &dot);
+	err|= clSetKernelArg(kernel, 3, sizeof(n), &n);	/* NOTE(review): OR-ing cl_int codes can report a value matching no single failure */
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+		size_t global=1;	/* a single work-item is launched; the kernel loops over all n elements itself */
+		size_t local;
+                size_t s;	/* receives the returned parameter size; not read afterwards */
+                cl_device_id device;
+
+                starpu_opencl_get_device(devid, &device);
+		/* Ask for the kernel's work-group size on this device, then clamp it to the global size. */
+                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
+                if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+                if (local > global)
+			local=global;
+
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+	/* Drain the queue before passing the event to starpu_opencl_collect_stats() and releasing it. */
+	clFinish(queue);	/* NOTE(review): return value is not checked */
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+}
+#endif
+
 static struct starpu_codelet dot_codelet =
 {
-	.where = STARPU_CPU|STARPU_CUDA,
 	.can_execute = can_execute,
 	.cpu_funcs = {dot_cpu_func, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dot_cuda_func, NULL},
 #endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {dot_opencl_func, NULL},
+#endif
 	.nbuffers = 3,
 	.modes = {STARPU_R, STARPU_R, STARPU_REDUX}
 };
@@ -188,6 +335,12 @@ int main(int argc, char **argv)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+#ifdef STARPU_USE_OPENCL
+	ret = starpu_opencl_load_opencl_from_file("examples/reductions/dot_product_opencl_kernels.cl",
+						  &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+#endif
+
 	starpu_helper_cublas_init();
 
 	unsigned long nelems = nblocks*entries_per_block;
@@ -257,6 +410,9 @@ int main(int argc, char **argv)
 
 	starpu_helper_cublas_shutdown();
 
+#ifdef STARPU_USE_OPENCL
+        starpu_opencl_unload_opencl(&opencl_program);
+#endif
 	starpu_shutdown();
 
 	free(x);

+ 6 - 0
examples/reductions/dot_product.h

@@ -0,0 +1,6 @@
+#ifndef DOT_PRODUCT_H__
+#define DOT_PRODUCT_H__
+/* Accumulator type used by both dot_product.c and dot_product_opencl_kernels.cl, which each include this header. */
+#define DOT_TYPE double
+
+#endif /* DOT_PRODUCT_H__ */

+ 40 - 0
examples/reductions/dot_product_opencl_kernels.cl

@@ -0,0 +1,40 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "dot_product.h"
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+__kernel void _redux_opencl(__global DOT_TYPE *dota,
+			    __global DOT_TYPE *dotb)
+{
+	/* Reduction step: fold the partial sum *dotb into *dota. No work-item index is needed. */
+	*dota += *dotb;
+}
+
+__kernel void _dot_opencl(__global float *x,
+			  __global float *y,
+			  __global DOT_TYPE *dot,
+			  unsigned n)
+{
+	/* Sequential dot product of x[0..n) and y[0..n); the partial sum is added to *dot. */
+	unsigned i;
+	DOT_TYPE tmp = 0.0;	/* private per-work-item accumulator, same type as *dot */
+	for (i = 0; i < n ; i++)
+		tmp += x[i]*y[i];
+
+	*dot += tmp;
+}