|
@@ -1,6 +1,7 @@
|
|
|
/* StarPU --- Runtime system for heterogeneous multicore architectures.
|
|
|
*
|
|
|
* Copyright (C) 2010-2011 Université de Bordeaux 1
|
|
|
+ * Copyright (C) 2012 inria
|
|
|
*
|
|
|
* StarPU is free software; you can redistribute it and/or modify
|
|
|
* it under the terms of the GNU Lesser General Public License as published by
|
|
@@ -17,31 +18,40 @@
|
|
|
#include <starpu.h>
|
|
|
#include <assert.h>
|
|
|
|
|
|
+#include <reductions/dot_product.h>
|
|
|
+
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
#include <cuda.h>
|
|
|
#include <cublas.h>
|
|
|
#include <starpu_cuda.h>
|
|
|
#endif
|
|
|
|
|
|
+#ifdef STARPU_USE_OPENCL
|
|
|
+#include <starpu_opencl.h>
|
|
|
+#endif
|
|
|
+
|
|
|
#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
|
|
|
|
|
|
static float *x;
|
|
|
static float *y;
|
|
|
static starpu_data_handle_t *x_handles;
|
|
|
static starpu_data_handle_t *y_handles;
|
|
|
+#ifdef STARPU_USE_OPENCL
|
|
|
+static struct starpu_opencl_program opencl_program;
|
|
|
+#endif
|
|
|
|
|
|
static unsigned nblocks = 4096;
|
|
|
static unsigned entries_per_block = 1024;
|
|
|
|
|
|
-#define DOT_TYPE double
|
|
|
-
|
|
|
static DOT_TYPE dot = 0.0f;
|
|
|
static starpu_data_handle_t dot_handle;
|
|
|
|
|
|
static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
|
|
|
{
|
|
|
- if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
|
|
|
+ enum starpu_archtype type = starpu_worker_get_type(workerid);
|
|
|
+ if (type == STARPU_CPU_WORKER || type == STARPU_OPENCL_WORKER)
|
|
|
return 1;
|
|
|
+
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
/* Cuda device */
|
|
|
const struct cudaDeviceProp *props;
|
|
@@ -73,14 +83,41 @@ void init_cuda_func(void *descr[], void *cl_arg)
|
|
|
}
|
|
|
#endif
|
|
|
|
|
|
+#ifdef STARPU_USE_OPENCL
|
|
|
+void init_opencl_func(void *buffers[], void *args)
|
|
|
+{
|
|
|
+ cl_int err;
|
|
|
+ cl_command_queue queue;
|
|
|
+
|
|
|
+ cl_mem dot = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[0]);
|
|
|
+ starpu_opencl_get_current_queue(&queue);
|
|
|
+ DOT_TYPE zero = (DOT_TYPE) 0.0;
|
|
|
+
|
|
|
+ err = clEnqueueWriteBuffer(queue,
|
|
|
+ dot,
|
|
|
+ CL_TRUE,
|
|
|
+ 0,
|
|
|
+ sizeof(DOT_TYPE),
|
|
|
+ &zero,
|
|
|
+ 0,
|
|
|
+ NULL,
|
|
|
+ NULL);
|
|
|
+ if (err != CL_SUCCESS)
|
|
|
+ STARPU_OPENCL_REPORT_ERROR(err);
|
|
|
+
|
|
|
+}
|
|
|
+#endif
|
|
|
+
|
|
|
static struct starpu_codelet init_codelet =
|
|
|
{
|
|
|
- .where = STARPU_CPU|STARPU_CUDA,
|
|
|
.can_execute = can_execute,
|
|
|
.cpu_funcs = {init_cpu_func, NULL},
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
.cuda_funcs = {init_cuda_func, NULL},
|
|
|
#endif
|
|
|
+#ifdef STARPU_USE_OPENCL
|
|
|
+ .opencl_funcs = {init_opencl_func, NULL},
|
|
|
+#endif
|
|
|
.nbuffers = 1
|
|
|
};
|
|
|
|
|
@@ -100,14 +137,67 @@ void redux_cpu_func(void *descr[], void *cl_arg)
|
|
|
extern void redux_cuda_func(void *descr[], void *_args);
|
|
|
#endif
|
|
|
|
|
|
+#ifdef STARPU_USE_OPENCL
|
|
|
+void redux_opencl_func(void *buffers[], void *args)
|
|
|
+{
|
|
|
+ int id, devid;
|
|
|
+ cl_int err;
|
|
|
+ cl_kernel kernel;
|
|
|
+ cl_command_queue queue;
|
|
|
+ cl_event event;
|
|
|
+
|
|
|
+ cl_mem dota = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[0]);
|
|
|
+ cl_mem dotb = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[1]);
|
|
|
+
|
|
|
+ id = starpu_worker_get_id();
|
|
|
+ devid = starpu_worker_get_devid(id);
|
|
|
+
|
|
|
+ err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "_redux_opencl", devid);
|
|
|
+ if (err != CL_SUCCESS)
|
|
|
+ STARPU_OPENCL_REPORT_ERROR(err);
|
|
|
+
|
|
|
+ err = clSetKernelArg(kernel, 0, sizeof(dota), &dota);
|
|
|
+ err|= clSetKernelArg(kernel, 1, sizeof(dotb), &dotb);
|
|
|
+ if (err != CL_SUCCESS)
|
|
|
+ STARPU_OPENCL_REPORT_ERROR(err);
|
|
|
+
|
|
|
+ {
|
|
|
+ size_t global=1;
|
|
|
+ size_t local;
|
|
|
+ size_t s;
|
|
|
+ cl_device_id device;
|
|
|
+
|
|
|
+ starpu_opencl_get_device(devid, &device);
|
|
|
+
|
|
|
+ err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
|
|
|
+ if (err != CL_SUCCESS)
|
|
|
+ STARPU_OPENCL_REPORT_ERROR(err);
|
|
|
+ if (local > global)
|
|
|
+ local=global;
|
|
|
+
|
|
|
+ err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
|
|
|
+ if (err != CL_SUCCESS)
|
|
|
+ STARPU_OPENCL_REPORT_ERROR(err);
|
|
|
+ }
|
|
|
+
|
|
|
+ clFinish(queue);
|
|
|
+ starpu_opencl_collect_stats(event);
|
|
|
+ clReleaseEvent(event);
|
|
|
+
|
|
|
+ starpu_opencl_release_kernel(kernel);
|
|
|
+}
|
|
|
+#endif
|
|
|
+
|
|
|
static struct starpu_codelet redux_codelet =
|
|
|
{
|
|
|
- .where = STARPU_CPU|STARPU_CUDA,
|
|
|
.can_execute = can_execute,
|
|
|
.cpu_funcs = {redux_cpu_func, NULL},
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
.cuda_funcs = {redux_cuda_func, NULL},
|
|
|
#endif
|
|
|
+#ifdef STARPU_USE_OPENCL
|
|
|
+ .opencl_funcs = {redux_opencl_func, NULL},
|
|
|
+#endif
|
|
|
.nbuffers = 2
|
|
|
};
|
|
|
|
|
@@ -163,14 +253,71 @@ void dot_cuda_func(void *descr[], void *cl_arg)
|
|
|
}
|
|
|
#endif
|
|
|
|
|
|
+#ifdef STARPU_USE_OPENCL
|
|
|
+void dot_opencl_func(void *buffers[], void *args)
|
|
|
+{
|
|
|
+ int id, devid;
|
|
|
+ cl_int err;
|
|
|
+ cl_kernel kernel;
|
|
|
+ cl_command_queue queue;
|
|
|
+ cl_event event;
|
|
|
+
|
|
|
+ cl_mem x = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
|
|
|
+ cl_mem y = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[1]);
|
|
|
+ cl_mem dot = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[2]);
|
|
|
+ unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
|
|
|
+
|
|
|
+ id = starpu_worker_get_id();
|
|
|
+ devid = starpu_worker_get_devid(id);
|
|
|
+
|
|
|
+ err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "_dot_opencl", devid);
|
|
|
+ if (err != CL_SUCCESS)
|
|
|
+ STARPU_OPENCL_REPORT_ERROR(err);
|
|
|
+
|
|
|
+ err = clSetKernelArg(kernel, 0, sizeof(x), &x);
|
|
|
+ err|= clSetKernelArg(kernel, 1, sizeof(y), &y);
|
|
|
+ err|= clSetKernelArg(kernel, 2, sizeof(dot), &dot);
|
|
|
+ err|= clSetKernelArg(kernel, 3, sizeof(n), &n);
|
|
|
+ if (err != CL_SUCCESS)
|
|
|
+ STARPU_OPENCL_REPORT_ERROR(err);
|
|
|
+
|
|
|
+ {
|
|
|
+ size_t global=1;
|
|
|
+ size_t local;
|
|
|
+ size_t s;
|
|
|
+ cl_device_id device;
|
|
|
+
|
|
|
+ starpu_opencl_get_device(devid, &device);
|
|
|
+
|
|
|
+ err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
|
|
|
+ if (err != CL_SUCCESS)
|
|
|
+ STARPU_OPENCL_REPORT_ERROR(err);
|
|
|
+ if (local > global)
|
|
|
+ local=global;
|
|
|
+
|
|
|
+ err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
|
|
|
+ if (err != CL_SUCCESS)
|
|
|
+ STARPU_OPENCL_REPORT_ERROR(err);
|
|
|
+ }
|
|
|
+
|
|
|
+ clFinish(queue);
|
|
|
+ starpu_opencl_collect_stats(event);
|
|
|
+ clReleaseEvent(event);
|
|
|
+
|
|
|
+ starpu_opencl_release_kernel(kernel);
|
|
|
+}
|
|
|
+#endif
|
|
|
+
|
|
|
static struct starpu_codelet dot_codelet =
|
|
|
{
|
|
|
- .where = STARPU_CPU|STARPU_CUDA,
|
|
|
.can_execute = can_execute,
|
|
|
.cpu_funcs = {dot_cpu_func, NULL},
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
.cuda_funcs = {dot_cuda_func, NULL},
|
|
|
#endif
|
|
|
+#ifdef STARPU_USE_OPENCL
|
|
|
+ .opencl_funcs = {dot_opencl_func, NULL},
|
|
|
+#endif
|
|
|
.nbuffers = 3,
|
|
|
.modes = {STARPU_R, STARPU_R, STARPU_REDUX}
|
|
|
};
|
|
@@ -188,6 +335,12 @@ int main(int argc, char **argv)
|
|
|
return 77;
|
|
|
STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
|
|
|
|
|
|
+#ifdef STARPU_USE_OPENCL
|
|
|
+ ret = starpu_opencl_load_opencl_from_file("examples/reductions/dot_product_opencl_kernels.cl",
|
|
|
+ &opencl_program, NULL);
|
|
|
+ STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
|
|
|
+#endif
|
|
|
+
|
|
|
starpu_helper_cublas_init();
|
|
|
|
|
|
unsigned long nelems = nblocks*entries_per_block;
|
|
@@ -257,6 +410,9 @@ int main(int argc, char **argv)
|
|
|
|
|
|
starpu_helper_cublas_shutdown();
|
|
|
|
|
|
+#ifdef STARPU_USE_OPENCL
|
|
|
+ starpu_opencl_unload_opencl(&opencl_program);
|
|
|
+#endif
|
|
|
starpu_shutdown();
|
|
|
|
|
|
free(x);
|