|
@@ -18,6 +18,9 @@
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
#include <cuda.h>
|
|
|
#endif
|
|
|
+#ifdef STARPU_USE_OPENCL
|
|
|
+#include <CL/cl.h>
|
|
|
+#endif
|
|
|
|
|
|
#define INIT_VALUE 42
|
|
|
#define NTASKS 10000
|
|
@@ -29,21 +32,32 @@ static uintptr_t per_worker[STARPU_NMAXWORKERS];
|
|
|
static starpu_data_handle per_worker_handle[STARPU_NMAXWORKERS];
|
|
|
|
|
|
/* Create per-worker handles */
|
|
|
-
|
|
|
static void initialize_per_worker_handle(void *arg __attribute__((unused)))
|
|
|
{
|
|
|
int workerid = starpu_worker_get_id();
|
|
|
|
|
|
/* Allocate memory on the worker, and initialize it to 0 */
|
|
|
-
|
|
|
switch (starpu_worker_get_type(workerid)) {
|
|
|
case STARPU_CPU_WORKER:
|
|
|
per_worker[workerid] = (uintptr_t)calloc(1, sizeof(variable));
|
|
|
break;
|
|
|
+#ifdef STARPU_USE_OPENCL
|
|
|
case STARPU_OPENCL_WORKER:
|
|
|
- /* Not supported yet */
|
|
|
- STARPU_ABORT();
|
|
|
+ {
|
|
|
+ cl_context context;
|
|
|
+ cl_command_queue queue;
|
|
|
+ starpu_opencl_get_current_context(&context);
|
|
|
+ starpu_opencl_get_current_queue(&queue);
|
|
|
+
|
|
|
+ cl_mem ptr = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(variable), NULL, NULL);
|
|
|
+ /* Poor's man memset */
|
|
|
+ unsigned zero = 0;
|
|
|
+ clEnqueueWriteBuffer(queue, ptr, CL_TRUE, 0, sizeof(variable), (void *)&zero, 0, NULL, NULL);
|
|
|
+ per_worker[workerid] = (uintptr_t)ptr;
|
|
|
+ }
|
|
|
+
|
|
|
break;
|
|
|
+#endif
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
case STARPU_CUDA_WORKER:
|
|
|
cudaMalloc((void **)&per_worker[workerid], sizeof(variable));
|
|
@@ -90,15 +104,43 @@ static void cpu_func_incr(void *descr[], void *cl_arg __attribute__((unused)))
|
|
|
*val = *val + 1;
|
|
|
}
|
|
|
|
|
|
-#ifdef STARPU_USE_CUDA
|
|
|
-extern void cuda_codelet_unsigned_inc(void *descr[], void *cl_arg);
|
|
|
+#ifdef STARPU_USE_CUDA
|
|
|
+/* dummy CUDA implementation */
|
|
|
+static void cuda_func_incr(void *descr[], void *cl_arg __attribute__((unused)))
|
|
|
+{
|
|
|
+ unsigned *val = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
|
|
|
+
|
|
|
+ unsigned h_val;
|
|
|
+ cudaMemcpy(&h_val, val, sizeof(unsigned), cudaMemcpyDeviceToHost);
|
|
|
+ h_val++;
|
|
|
+ cudaMemcpy(val, h_val, sizeof(unsigned), cudaMemcpyHostToDevice);
|
|
|
+}
|
|
|
+#endif
|
|
|
+
|
|
|
+#ifdef STARPU_USE_OPENCL
|
|
|
+/* dummy OpenCL implementation */
|
|
|
+static void opencl_func_incr(void *descr[], void *cl_arg __attribute__((unused)))
|
|
|
+{
|
|
|
+ cl_mem d_val = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
|
|
|
+ unsigned h_val;
|
|
|
+
|
|
|
+ cl_command_queue queue;
|
|
|
+ starpu_opencl_get_current_queue(&queue);
|
|
|
+
|
|
|
+ clEnqueueReadBuffer(queue, d_val, CL_TRUE, 0, sizeof(unsigned), (void *)&h_val, 0, NULL, NULL);
|
|
|
+ h_val++;
|
|
|
+ clEnqueueWriteBuffer(queue, d_val, CL_TRUE, 0, sizeof(unsigned), (void *)&h_val, 0, NULL, NULL);
|
|
|
+}
|
|
|
#endif
|
|
|
|
|
|
static struct starpu_codelet_t use_data_on_worker_codelet = {
|
|
|
- .where = STARPU_CPU|STARPU_CUDA,
|
|
|
+ .where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
|
|
|
.cpu_func = cpu_func_incr,
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
- .cuda_func = cuda_codelet_unsigned_inc,
|
|
|
+ .cuda_func = cuda_func_incr,
|
|
|
+#endif
|
|
|
+#ifdef STARPU_USE_OPENCL
|
|
|
+ .opencl_func = opencl_func_incr,
|
|
|
#endif
|
|
|
.nbuffers = 1,
|
|
|
.model = NULL
|
|
@@ -118,7 +160,7 @@ int main(int argc, char **argv)
|
|
|
starpu_variable_data_register(&variable_handle, 0, (uintptr_t)&variable, sizeof(unsigned));
|
|
|
|
|
|
/* Allocate a per-worker handle on each worker (and initialize it to 0) */
|
|
|
- starpu_execute_on_each_worker(initialize_per_worker_handle, NULL, STARPU_CPU|STARPU_CUDA);
|
|
|
+ starpu_execute_on_each_worker(initialize_per_worker_handle, NULL, STARPU_CPU|STARPU_CUDA|STARPU_OPENCL);
|
|
|
|
|
|
/* Register all per-worker handles */
|
|
|
for (worker = 0; worker < nworkers; worker++)
|