14 年之前 · 6cdffa0b55
--- a/examples/stencil/Makefile.am
+++ b/examples/stencil/Makefile.am
@@ -68,6 +68,12 @@ stencil_SOURCES +=				\
 
				 	shadow.cu
			
 
				 endif
			
 
				 
			
 
				+if STARPU_USE_OPENCL
			
 
				+stencil_SOURCES +=				\
			
 
				+	life_opencl.c				\
			
 
				+	shadow_opencl.c
			
 
				+endif
			
 
				+
			
 
				 outs =						\
			
 
				 	0.5.out					\
			
 
				 	0.out					\
			
--- a/examples/stencil/life_opencl.c
+++ b/examples/stencil/life_opencl.c
@@ -0,0 +1,118 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+/* Heart of the stencil computation: compute a new state from an old one. */
			
 
				+
			
 
				+//#define _externC extern "C"
			
 
				+
			
 
				+#include <stencil.h>
			
 
				+#include <CL/cl.h>
			
 
				+#include <starpu.h>
			
 
				+#include <starpu_opencl.h>
			
 
				+
			
 
				+#define str(x) #x
			
 
				+
			
 
				+#define clsrc(t,k) "__kernel void\n\
			
 
				+#define TYPE " str(t) "\n\
			
 
				+#define K " str(k) "\n\
			
 
				+life_update(int bz, __global const TYPE *old, __global TYPE *newp, int nx, int ny, int nz, int ldy, int ldz, int iter)\n\
			
 
				+{\n\
			
 
				+	unsigned idx = get_global_id(0);\n\
			
 
				+	unsigned idy = get_global_id(1);\n\
			
 
				+	//unsigned idz = threadIdx.z + blockIdx.z * blockDim.z;\n\
			
 
				+	unsigned idz = 0;\n\
			
 
				+	unsigned stepx = get_global_size(0);\n\
			
 
				+	unsigned stepy = get_global_size(1);\n\
			
 
				+	//unsigned stepz = blockDim.z * gridDim.z;\n\
			
 
				+	unsigned stepz = 1;\n\
			
 
				+	unsigned x, y, z;\n\
			
 
				+	unsigned num, alive;\n\
			
 
				+\n\
			
 
				+	for (z = iter + idz; z < nz - iter; z += stepz)\n\
			
 
				+		for (y = K + idy; y < ny - K; y += stepy) {\n\
			
 
				+			for (x = K + idx; x < nx - K; x += stepx) {\n\
			
 
				+				unsigned index = x + y*ldy + z*ldz;\n\
			
 
				+				num = 0\n\
			
 
				+                                        + old[index+1*ldy+0*ldz]\n\
			
 
				+                                        + old[index+1*ldy+1*ldz]\n\
			
 
				+                                        + old[index+0*ldy+1*ldz]\n\
			
 
				+                                        + old[index-1*ldy+1*ldz]\n\
			
 
				+                                        + old[index-1*ldy+0*ldz]\n\
			
 
				+                                        + old[index-1*ldy-1*ldz]\n\
			
 
				+                                        + old[index+0*ldy-1*ldz]\n\
			
 
				+                                        + old[index+1*ldy-1*ldz]\n\
			
 
				+					;\n\
			
 
				+				alive = old[index];\n\
			
 
				+				alive = (alive && num == 2) || num == 3;\n\
			
 
				+				newp[index] = alive;\n\
			
 
				+			}\n\
			
 
				+		}\n\
			
 
				+}"
			
 
				+
			
 
				+static const char * src = clsrc(TYPE,K);
			
 
				+static struct starpu_opencl_program program;
			
 
				+
			
 
				+void
			
 
				+opencl_life_init(void) {
			
 
				+  starpu_opencl_load_opencl_from_string(src, &program);
			
 
				+}
			
 
				+
			
 
				+void opencl_life_free(void) {
			
 
				+  starpu_opencl_unload_opencl(&program);
			
 
				+}
			
 
				+
			
 
				+void
			
 
				+opencl_life_update_host(int bz, const TYPE *old, TYPE *newp, int nx, int ny, int nz, int ldy, int ldz, int iter)
			
 
				+{
			
 
				+	unsigned max_parallelism = 512;
			
 
				+	unsigned threads_per_dim_x = max_parallelism;
			
 
				+	while (threads_per_dim_x / 2 >= nx)
			
 
				+		threads_per_dim_x /= 2;
			
 
				+	unsigned threads_per_dim_y = max_parallelism / threads_per_dim_x;
			
 
				+	while (threads_per_dim_y / 2 >= ny)
			
 
				+		threads_per_dim_y /= 2;
			
 
				+#if 0
			
 
				+	unsigned threads_per_dim_z = 4;
			
 
				+	size_t dimBlock[] = {threads_per_dim_x, threads_per_dim_y, threads_per_dim_z};
			
 
				+	size_t dimGrid[] = {nx / threads_per_dim_x, ny / threads_per_dim_y, nz / threads_per_dim_z};
			
 
				+#else
			
 
				+	size_t dimBlock[] = {threads_per_dim_x, threads_per_dim_y, 1};
			
 
				+	size_t dimGrid[] = {((nx + threads_per_dim_x-1) / threads_per_dim_x)*threads_per_dim_x, ((ny + threads_per_dim_y-1) / threads_per_dim_y)*threads_per_dim_y, 1};
			
 
				+#endif
			
 
				+
			
 
				+  int devid,id;
			
 
				+  id = starpu_worker_get_id();
			
 
				+  devid = starpu_worker_get_devid(id);
			
 
				+
			
 
				+  cl_kernel kernel;
			
 
				+  cl_command_queue cq;
			
 
				+  starpu_opencl_load_kernel(&kernel, &cq, &program, "life_update", devid);
			
 
				+
			
 
				+  clSetKernelArg(kernel, 0, sizeof(bz), &bz);
			
 
				+  clSetKernelArg(kernel, 1, sizeof(old), &old);
			
 
				+  clSetKernelArg(kernel, 2, sizeof(newp), &newp);
			
 
				+  clSetKernelArg(kernel, 3, sizeof(nx), &nx);
			
 
				+  clSetKernelArg(kernel, 4, sizeof(ny), &ny);
			
 
				+  clSetKernelArg(kernel, 5, sizeof(nz), &nz);
			
 
				+  clSetKernelArg(kernel, 6, sizeof(ldy), &ldy);
			
 
				+  clSetKernelArg(kernel, 7, sizeof(ldz), &ldz);
			
 
				+  clSetKernelArg(kernel, 8, sizeof(iter), &iter);
			
 
				+
			
 
				+  cl_event ev;
			
 
				+  clEnqueueNDRangeKernel(cq, kernel, 3, NULL, dimGrid, dimBlock, 0, NULL, &ev);
			
 
				+  clWaitForEvents(1, &ev);
			
 
				+  clReleaseEvent(ev);
			
 
				+}
			
--- a/examples/stencil/shadow_opencl.c
+++ b/examples/stencil/shadow_opencl.c
@@ -0,0 +1,120 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "stencil.h"
			
 
				+#include <starpu_opencl.h>
			
 
				+
			
 
				+/* Perform replication of data on X and Y edges, to fold the domain on 
			
 
				+   itself through mere replication of the source state. */
			
 
				+
			
 
				+#define str(x) #x
			
 
				+
			
 
				+#define clsrc(t,k) "__kernel void\n\
			
 
				+#define TYPE " str(t) "\n\
			
 
				+#define K " str(k) "\n\
			
 
				+shadow( int bz, __global TYPE *ptr, int nx, int ny, int nz, int ldy, int ldz, int i)\n\
			
 
				+{\n\
			
 
				+	unsigned idx = get_global_id(0);\n\
			
 
				+	unsigned idy = get_global_id(1);\n\
			
 
				+	//unsigned idz = threadIdx.z + blockIdx.z * blockDim.z;\n\
			
 
				+	unsigned idz = 0;\n\
			
 
				+	unsigned stepx = get_global_size(0);\n\
			
 
				+	unsigned stepy = get_global_size(1);\n\
			
 
				+	//unsigned stepz = blockDim.z * gridDim.z;\n\
			
 
				+	unsigned stepz = 1;\n\
			
 
				+	unsigned x, y, z;\n\
			
 
				+	if (idy == 0)\n\
			
 
				+		for (z = i-1 + idz; z < nz-(i-1); z += stepz)\n\
			
 
				+			for (x = K + idx; x < nx-K; x += stepx) {\n\
			
 
				+				unsigned index = x+z*ldz;\n\
			
 
				+				ptr[index+(K-1)*ldy] = ptr[index+(ny-K-1)*ldy];\n\
			
 
				+				ptr[index+(ny-K)*ldy] = ptr[index+K*ldy];\n\
			
 
				+			}\n\
			
 
				+\n\
			
 
				+	if (idx == 0)\n\
			
 
				+		for (z = i-1 + idz; z < nz-(i-1); z += stepz)\n\
			
 
				+			for (y = K + idy; y < ny-K; y += stepy) {\n\
			
 
				+				unsigned index = y*ldy+z*ldz;\n\
			
 
				+				ptr[(K-1)+index] = ptr[(nx-K-1)+index];\n\
			
 
				+				ptr[(nx-K)+index] = ptr[K+index];\n\
			
 
				+			}\n\
			
 
				+\n\
			
 
				+	if (idx == 0 && idy == 0)\n\
			
 
				+		for (z = i-1 + idz; z < nz-(i-1); z += stepz) {\n\
			
 
				+			unsigned index = z*ldz;\n\
			
 
				+			ptr[K-1+(K-1)*ldy+index] = ptr[(nx-K-1)+(ny-K-1)*ldy+index];\n\
			
 
				+			ptr[(nx-K)+(K-1)*ldy+index] = ptr[K+(ny-K-1)*ldy+index];\n\
			
 
				+			ptr[(K-1)+(ny-K)*ldy+index] = ptr[(nx-K-1)+K*ldy+index];\n\
			
 
				+			ptr[(nx-K)+(ny-K)*ldy+index] = ptr[K+K*ldy+index];\n\
			
 
				+		}\n\
			
 
				+}"
			
 
				+
			
 
				+static const char * src = clsrc(TYPE,K);
			
 
				+static struct starpu_opencl_program program;
			
 
				+
			
 
				+void
			
 
				+opencl_shadow_init(void) {
			
 
				+  starpu_opencl_load_opencl_from_string(src, &program);
			
 
				+}
			
 
				+
			
 
				+void opencl_shadow_free(void) {
			
 
				+  starpu_opencl_unload_opencl(&program);
			
 
				+}
			
 
				+
			
 
				+void
			
 
				+opencl_shadow_host(int bz, TYPE *ptr, int nx, int ny, int nz, int ldy, int ldz, int i)
			
 
				+{
			
 
				+	unsigned max_parallelism = 512;
			
 
				+	unsigned threads_per_dim_x = max_parallelism;
			
 
				+	while (threads_per_dim_x / 2 >= nx)
			
 
				+		threads_per_dim_x /= 2;
			
 
				+	unsigned threads_per_dim_y = max_parallelism / threads_per_dim_x;
			
 
				+	while (threads_per_dim_y / 2 >= ny)
			
 
				+		threads_per_dim_y /= 2;
			
 
				+#if 0
			
 
				+	unsigned threads_per_dim_z = 4;
			
 
				+	size_t dimBlock[] = {threads_per_dim_x, threads_per_dim_y, threads_per_dim_z};
			
 
				+	size_t dimGrid[] = {nx / threads_per_dim_x, ny / threads_per_dim_y, nz / threads_per_dim_z};
			
 
				+#else
			
 
				+	size_t dimBlock[] = {threads_per_dim_x, threads_per_dim_y, 1};
			
 
				+	size_t dimGrid[] = {((nx + threads_per_dim_x-1) / threads_per_dim_x)*threads_per_dim_x, ((ny + threads_per_dim_y-1) / threads_per_dim_y)*threads_per_dim_y, 1};
			
 
				+#endif
			
 
				+
			
 
				+        int devid,id;
			
 
				+        id = starpu_worker_get_id();
			
 
				+        devid = starpu_worker_get_devid(id);
			
 
				+
			
 
				+        cl_kernel kernel;
			
 
				+        cl_command_queue cq;
			
 
				+        starpu_opencl_load_kernel(&kernel, &cq, &program, "shadow", devid);
			
 
				+
			
 
				+        clSetKernelArg(kernel, 0, sizeof(bz), &bz);
			
 
				+        clSetKernelArg(kernel, 1, sizeof(ptr), &ptr);
			
 
				+        clSetKernelArg(kernel, 2, sizeof(nx), &nx);
			
 
				+        clSetKernelArg(kernel, 3, sizeof(ny), &ny);
			
 
				+        clSetKernelArg(kernel, 4, sizeof(nz), &nz);
			
 
				+        clSetKernelArg(kernel, 5, sizeof(ldy), &ldy);
			
 
				+        clSetKernelArg(kernel, 6, sizeof(ldz), &ldz);
			
 
				+        clSetKernelArg(kernel, 7, sizeof(i), &i);
			
 
				+
			
 
				+        cl_event ev;
			
 
				+        cl_int err = clEnqueueNDRangeKernel(cq, kernel, 3, NULL, dimGrid, dimBlock, 0, NULL, &ev);
			
 
				+        if (err != CL_SUCCESS)
			
 
				+                STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+        clWaitForEvents(1, &ev);
			
 
				+        clReleaseEvent(ev);
			
 
				+}
			
 
				+
			
--- a/examples/stencil/stencil-kernels.c
+++ b/examples/stencil/stencil-kernels.c
@@ -16,6 +16,11 @@
 
				 #include "stencil.h"
			
 
				 #include <sys/time.h>
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+#include <CL/cl.h>
			
 
				+#include <starpu_opencl.h>
			
 
				+#endif
			
 
				+
			
 
				 #ifndef timersub
			
 
				 #define	timersub(x, y, res) \
			
 
				 	do { \
			
@@ -258,6 +263,97 @@ fprintf(stderr,"!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid)
 
				 #endif /* STARPU_USE_CUDA */
			
 
				 
			
 
				 /*
			
 
				+ * Load a neighbour's boundary into block, OpenCL version
			
 
				+ */
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+static void load_subblock_from_buffer_opencl(starpu_block_interface_t *block,
			
 
				+					starpu_block_interface_t *boundary,
			
 
				+					unsigned firstz)
			
 
				+{
			
 
				+	check_load(block, boundary);
			
 
				+
			
 
				+	/* We do a contiguous memory transfer */
			
 
				+	size_t boundary_size = K*block->ldz*block->elemsize;
			
 
				+
			
 
				+	unsigned offset = firstz*block->ldz;
			
 
				+	cl_mem block_data = (cl_mem)block->ptr;
			
 
				+	cl_mem boundary_data = (cl_mem)boundary->ptr;
			
 
				+
			
 
				+        cl_command_queue cq;
			
 
				+        starpu_opencl_get_current_queue(&cq);
			
 
				+        clEnqueueCopyBuffer(cq, boundary_data, block_data, 0, offset, boundary_size, 0, NULL, NULL);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * cl_update (OpenCL version)
			
 
				+ */
			
 
				+static void update_func_opencl(void *descr[], void *arg)
			
 
				+{
			
 
				+	struct block_description *block = arg;
			
 
				+	int workerid = starpu_worker_get_id();
			
 
				+	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
			
 
				+	if (block->bz == 0)
			
 
				+fprintf(stderr,"!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, workerid);
			
 
				+	else
			
 
				+	DEBUG( "!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, workerid);
			
 
				+#ifdef STARPU_USE_MPI
			
 
				+	int rank = 0;
			
 
				+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	DEBUG( "!!!           RANK %d              !!!\n", rank);
			
 
				+#endif
			
 
				+	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
			
 
				+
			
 
				+	unsigned block_size_z = get_block_size(block->bz);
			
 
				+	unsigned i;
			
 
				+	update_per_worker[workerid]++;
			
 
				+
			
 
				+	record_who_runs_what(block);
			
 
				+
			
 
				+        cl_command_queue cq;
			
 
				+        starpu_opencl_get_current_queue(&cq);
			
 
				+
			
 
				+	/*
			
 
				+	 *	Load neighbours' boundaries : TOP
			
 
				+	 */
			
 
				+
			
 
				+	/* The offset along the z axis is (block_size_z + K) */
			
 
				+	load_subblock_from_buffer_opencl(descr[0], descr[2], block_size_z+K);
			
 
				+	load_subblock_from_buffer_opencl(descr[1], descr[3], block_size_z+K);
			
 
				+
			
 
				+	/*
			
 
				+	 *	Load neighbours' boundaries : BOTTOM
			
 
				+	 */
			
 
				+	load_subblock_from_buffer_opencl(descr[0], descr[4], 0);
			
 
				+	load_subblock_from_buffer_opencl(descr[1], descr[5], 0);
			
 
				+
			
 
				+	/*
			
 
				+	 *	Stencils ... do the actual work here :) TODO
			
 
				+	 */
			
 
				+
			
 
				+	for (i=1; i<=K; i++)
			
 
				+	{
			
 
				+		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
			
 
				+		TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;
			
 
				+
			
 
				+		/* Shadow data */
			
 
				+		opencl_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
			
 
				+
			
 
				+		/* And perform actual computation */
			
 
				+#ifdef LIFE
			
 
				+		opencl_life_update_host(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
			
 
				+#else
			
 
				+                clEnqueueCopyBuffer(cq, old, new, 0, 0, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), 0, NULL, NULL);
			
 
				+#endif /* LIFE */
			
 
				+	}
			
 
				+
			
 
				+	cl_int err;
			
 
				+	if ((err = clFinish(cq)))
			
 
				+		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+}
			
 
				+#endif /* STARPU_USE_OPENCL */
			
 
				+
			
 
				+/*
			
 
				  * cl_update (CPU version)
			
 
				  */
			
 
				 static void update_func_cpu(void *descr[], void *arg)
			
@@ -335,15 +431,21 @@ static struct starpu_perfmodel_t cl_update_model = {
 
				 };
			
 
				 
			
 
				 starpu_codelet cl_update = {
			
 
				-	.where = 
			
 
				+	.where = 0 |
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 		STARPU_CUDA|
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+                STARPU_OPENCL|
			
 
				+#endif
			
 
				 		STARPU_CPU,
			
 
				 	.cpu_func = update_func_cpu,
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_func = update_func_cuda,
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	.opencl_func = update_func_opencl,
			
 
				+#endif
			
 
				 	.model = &cl_update_model,
			
 
				 	.nbuffers = 6
			
 
				 };
			
@@ -386,6 +488,28 @@ static void load_subblock_into_buffer_cuda(starpu_block_interface_t *block,
 
				 }
			
 
				 #endif /* STARPU_USE_CUDA */
			
 
				 
			
 
				+/* OPENCL version */
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+static void load_subblock_into_buffer_opencl(starpu_block_interface_t *block,
			
 
				+					starpu_block_interface_t *boundary,
			
 
				+					unsigned firstz)
			
 
				+{
			
 
				+	check_load(block, boundary);
			
 
				+
			
 
				+	/* We do a contiguous memory transfer */
			
 
				+	size_t boundary_size = K*block->ldz*block->elemsize;
			
 
				+
			
 
				+	unsigned offset = firstz*block->ldz;
			
 
				+	cl_mem block_data = (cl_mem)block->ptr;
			
 
				+	cl_mem boundary_data = (cl_mem)boundary->ptr;
			
 
				+
			
 
				+        cl_command_queue cq;
			
 
				+        starpu_opencl_get_current_queue(&cq);
			
 
				+
			
 
				+        clEnqueueCopyBuffer(cq, block_data, boundary_data, offset, 0, boundary_size, 0, NULL, NULL);
			
 
				+}
			
 
				+#endif /* STARPU_USE_OPENCL */
			
 
				+
			
 
				 /* Record how many top/bottom saves each worker performed */
			
 
				 unsigned top_per_worker[STARPU_NMAXWORKERS];
			
 
				 unsigned bottom_per_worker[STARPU_NMAXWORKERS];
			
@@ -452,6 +576,45 @@ static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *
 
				 }
			
 
				 #endif /* STARPU_USE_CUDA */
			
 
				 
			
 
				+/* top save, OpenCL version */
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+static void dummy_func_top_opencl(void *descr[] __attribute__((unused)), void *arg)
			
 
				+{
			
 
				+	struct block_description *block = arg;
			
 
				+	int workerid = starpu_worker_get_id();
			
 
				+	top_per_worker[workerid]++;
			
 
				+
			
 
				+	DEBUG( "DO SAVE Top block %d\n", block->bz);
			
 
				+
			
 
				+	/* The offset along the z axis is (block_size_z + K)- K */
			
 
				+	unsigned block_size_z = get_block_size(block->bz);
			
 
				+
			
 
				+	load_subblock_into_buffer_opencl(descr[0], descr[2], block_size_z);
			
 
				+	load_subblock_into_buffer_opencl(descr[1], descr[3], block_size_z);
			
 
				+
			
 
				+        cl_command_queue cq;
			
 
				+        starpu_opencl_get_current_queue(&cq);
			
 
				+        clFinish(cq);
			
 
				+}
			
 
				+
			
 
				+/* bottom save, OPENCL version */
			
 
				+static void dummy_func_bottom_opencl(void *descr[] __attribute__((unused)), void *arg)
			
 
				+{
			
 
				+	struct block_description *block = arg;
			
 
				+	int workerid = starpu_worker_get_id();
			
 
				+	bottom_per_worker[workerid]++;
			
 
				+
			
 
				+	DEBUG( "DO SAVE Bottom block %d on OPENCL\n", block->bz);
			
 
				+
			
 
				+	load_subblock_into_buffer_opencl(descr[0], descr[2], K);
			
 
				+	load_subblock_into_buffer_opencl(descr[1], descr[3], K);
			
 
				+
			
 
				+        cl_command_queue cq;
			
 
				+        starpu_opencl_get_current_queue(&cq);
			
 
				+        clFinish(cq);
			
 
				+}
			
 
				+#endif /* STARPU_USE_OPENCL */
			
 
				+
			
 
				 /* Performance models and codelet for save */
			
 
				 static struct starpu_perfmodel_t save_cl_bottom_model = {
			
 
				 	.type = STARPU_HISTORY_BASED,
			
@@ -464,29 +627,41 @@ static struct starpu_perfmodel_t save_cl_top_model = {
 
				 };
			
 
				 
			
 
				 starpu_codelet save_cl_bottom = {
			
 
				-	.where = 
			
 
				+	.where = 0 |
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 		STARPU_CUDA|
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+		STARPU_OPENCL|
			
 
				+#endif
			
 
				 		STARPU_CPU,
			
 
				 	.cpu_func = dummy_func_bottom_cpu,
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_func = dummy_func_bottom_cuda,
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	.opencl_func = dummy_func_bottom_opencl,
			
 
				+#endif
			
 
				 	.model = &save_cl_bottom_model,
			
 
				 	.nbuffers = 4
			
 
				 };
			
 
				 
			
 
				 starpu_codelet save_cl_top = {
			
 
				-	.where = 
			
 
				+	.where = 0|
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 		STARPU_CUDA|
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+		STARPU_OPENCL|
			
 
				+#endif
			
 
				 		STARPU_CPU,
			
 
				 	.cpu_func = dummy_func_top_cpu,
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_func = dummy_func_top_cuda,
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	.opencl_func = dummy_func_top_opencl,
			
 
				+#endif
			
 
				 	.model = &save_cl_top_model,
			
 
				 	.nbuffers = 4
			
 
				 };
			
--- a/examples/stencil/stencil-tasks.c
+++ b/examples/stencil/stencil-tasks.c
@@ -219,9 +219,10 @@ void create_task_update(unsigned iter, unsigned z, unsigned local_rank)
 
				 /* Dummy empty codelet taking one buffer */
			
 
				 static void null_func(void *descr[] __attribute__((unused)), void *arg __attribute__((unused))) { }
			
 
				 static starpu_codelet null = {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cpu_func = null_func,
			
 
				 	.cuda_func = null_func,
			
 
				+	.opencl_func = null_func,
			
 
				 	.nbuffers = 2
			
 
				 };
			
 
				 
			
--- a/examples/stencil/stencil.c
+++ b/examples/stencil/stencil.c
@@ -197,6 +197,11 @@ int main(int argc, char **argv)
 
				 	starpu_mpi_initialize();
			
 
				 #endif
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+        opencl_life_init();
			
 
				+        opencl_shadow_init();
			
 
				+#endif /*STARPU_USE_OPENCL*/
			
 
				+
			
 
				 	init_problem(argc, argv, rank, world_size);
			
 
				 
			
 
				 	create_tasks(rank);
			
@@ -317,5 +322,10 @@ int main(int argc, char **argv)
 
				 	MPI_Finalize();
			
 
				 #endif
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+        opencl_life_free();
			
 
				+        opencl_shadow_free();
			
 
				+#endif /*STARPU_USE_OPENCL*/
			
 
				+
			
 
				 	return 0;
			
 
				 }
			
--- a/examples/stencil/stencil.h
+++ b/examples/stencil/stencil.h
@@ -133,6 +133,7 @@ extern struct timeval *last_tick;
 
				 #ifndef _externC
			
 
				 #define _externC
			
 
				 #endif
			
 
				+
			
 
				 _externC void cuda_life_update_host(int bz, const TYPE *old, TYPE *newp, int nx, int ny, int nz, int ldy, int ldz, int iter);
			
 
				 _externC void cuda_shadow_host(int bz, TYPE *ptr, int nx, int ny, int nz, int ldy, int ldz, int i);