Browse Source

* New codelet specific_nodes field to specify explicit target nodes for data.

Samuel Thibault 11 years ago
parent
commit
fedd296373

+ 1 - 0
ChangeLog

@@ -42,6 +42,7 @@ New features:
     sequential consistency defined for the given data)
     sequential consistency defined for the given data)
   * New functions starpu_mpi_task_build() and starpu_mpi_task_post_build()
   * New functions starpu_mpi_task_build() and starpu_mpi_task_post_build()
   * New functions starpu_pause() and starpu_resume()
   * New functions starpu_pause() and starpu_resume()
+  * New codelet specific_nodes field to specify explicit target nodes for data.
 
 
 Small features:
 Small features:
   * New functions starpu_data_acquire_cb_sequential_consistency() and
   * New functions starpu_data_acquire_cb_sequential_consistency() and

+ 25 - 0
doc/doxygen/chapters/07data_management.doxy

@@ -504,5 +504,30 @@ The whole code for this complex data interface is available in the
 directory <c>examples/interface/</c>.
 directory <c>examples/interface/</c>.
 
 
 
 
+\section SpecifyingATargetNode Specifying a target node for task data
+
+When executing a task on a GPU for instance, StarPU would normally copy all the
+data needed by the task to the embedded memory of the GPU.  It may however
+happen that the task kernel would rather have some of the data kept in the
+main memory instead of copied to the GPU, a pivoting vector for instance.
+This can be achieved by setting the starpu_codelet::specific_nodes flag to
+1, and then filling the starpu_codelet::nodes array (or starpu_codelet::dyn_nodes when
+starpu_codelet::nbuffers is greater than STARPU_NMAXBUFS) with the node numbers
+where data should be copied to, or -1 to let StarPU copy it to the memory node
+where the task will be executed. For instance, with the following codelet:
+
+\code{.c}
+struct starpu_codelet cl =
+{
+	.cuda_funcs = { kernel, NULL },
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_RW},
+	.specific_nodes = 1,
+	.nodes = {STARPU_MAIN_RAM, -1},
+};
+\endcode
+
+the first data of the task will be kept in the main memory, while the second
+data will be copied to the CUDA GPU as usual.
 
 
 */
 */

+ 24 - 1
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -278,13 +278,36 @@ unsufficient, this value can be set with the configure option
 
 
 \var starpu_codelet::dyn_modes
 \var starpu_codelet::dyn_modes
 Is an array of ::starpu_data_access_mode. It describes the required
 Is an array of ::starpu_data_access_mode. It describes the required
-access modes to the data neeeded by the codelet (e.g. ::STARPU_RW).
+access modes to the data needed by the codelet (e.g. ::STARPU_RW).
 The number of entries in this array must be specified in the field
 The number of entries in this array must be specified in the field
 starpu_codelet::nbuffers. This field should be used for codelets having a
 starpu_codelet::nbuffers. This field should be used for codelets having a
 number of datas greater than \ref STARPU_NMAXBUFS (see \ref
 number of datas greater than \ref STARPU_NMAXBUFS (see \ref
 SettingTheDataHandlesForATask). When defining a codelet, one
 SettingTheDataHandlesForATask). When defining a codelet, one
 should either define this field or the field starpu_codelet::modes defined above.
 should either define this field or the field starpu_codelet::modes defined above.
 
 
+\var starpu_codelet::specific_nodes
+Default value is 0. If this flag is set, StarPU will not systematically
+send all data to the memory node where the task will be executing; it
+will instead read the starpu_codelet::nodes or starpu_codelet::dyn_nodes array to
+determine, for each data, whether to send it to the memory node where the task
+will be executing (-1), or to a specific node (!= -1).
+
+\var starpu_codelet::nodes
+Optional field. When starpu_codelet::specific_nodes is 1, this specifies
+the memory nodes where each data should be sent to for task execution.
+The number of entries in this array is starpu_codelet::nbuffers, and should
+not exceed \ref STARPU_NMAXBUFS.
+
+\var starpu_codelet::dyn_nodes
+Optional field. When starpu_codelet::specific_nodes is 1, this specifies
+the memory nodes where each data should be sent to for task execution.
+The number of entries in this array is starpu_codelet::nbuffers.
+This field should be used for codelets having a
+number of datas greater than \ref STARPU_NMAXBUFS (see \ref
+SettingTheDataHandlesForATask). When defining a codelet, one
+should either define this field or the field starpu_codelet::nodes defined
+above.
+
 \var starpu_codelet::model
 \var starpu_codelet::model
 Optional pointer to the task duration performance model associated to
 Optional pointer to the task duration performance model associated to
 this codelet. This optional field is ignored when set to <c>NULL</c> or when
 this codelet. This optional field is ignored when set to <c>NULL</c> or when

+ 8 - 1
include/starpu_task.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  INRIA
  * Copyright (C) 2011  INRIA
@@ -100,6 +100,10 @@ struct starpu_codelet
 	enum starpu_data_access_mode modes[STARPU_NMAXBUFS];
 	enum starpu_data_access_mode modes[STARPU_NMAXBUFS];
 	enum starpu_data_access_mode *dyn_modes;
 	enum starpu_data_access_mode *dyn_modes;
 
 
+	unsigned specific_nodes;
+	int nodes[STARPU_NMAXBUFS];
+	int *dyn_nodes;
+
 	struct starpu_perfmodel *model;
 	struct starpu_perfmodel *model;
 	struct starpu_perfmodel *power_model;
 	struct starpu_perfmodel *power_model;
 
 
@@ -209,6 +213,9 @@ struct starpu_task
 #define STARPU_CODELET_GET_MODE(codelet, i) ((codelet->dyn_modes) ? codelet->dyn_modes[i] : codelet->modes[i])
 #define STARPU_CODELET_GET_MODE(codelet, i) ((codelet->dyn_modes) ? codelet->dyn_modes[i] : codelet->modes[i])
 #define STARPU_CODELET_SET_MODE(codelet, mode, i) do { if (codelet->dyn_modes) codelet->dyn_modes[i] = mode; else codelet->modes[i] = mode; } while(0)
 #define STARPU_CODELET_SET_MODE(codelet, mode, i) do { if (codelet->dyn_modes) codelet->dyn_modes[i] = mode; else codelet->modes[i] = mode; } while(0)
 
 
+/* Read (GET) or write (SET) the explicit target memory node of the i-th
+ * buffer of a codelet: the dyn_nodes array is used when it is non-NULL, the
+ * fixed-size nodes array otherwise.
+ * Arguments are parenthesized so that any pointer expression (e.g. &cl) can be
+ * passed; note that "codelet" and "i" are still expanded more than once, so
+ * they must not have side effects. */
+#define STARPU_CODELET_GET_NODE(codelet, i) (((codelet)->dyn_nodes) ? (codelet)->dyn_nodes[i] : (codelet)->nodes[i])
+#define STARPU_CODELET_SET_NODE(codelet, __node, i) do { if ((codelet)->dyn_nodes) (codelet)->dyn_nodes[i] = (__node); else (codelet)->nodes[i] = (__node); } while(0)
+
 void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...);
 void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...);
 void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t *array);
 void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t *array);
 
 

+ 5 - 1
src/core/dependencies/data_concurrency.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -254,6 +254,10 @@ unsigned _starpu_submit_job_enforce_data_deps(struct _starpu_job *j)
 		_STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(j, handle, i);
 		_STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(j, handle, i);
 		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(j->task->cl, i);
 		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(j->task->cl, i);
 		_STARPU_JOB_SET_ORDERED_BUFFER_MODE(j, mode, i);
 		_STARPU_JOB_SET_ORDERED_BUFFER_MODE(j, mode, i);
+		int node = -1;
+		if (j->task->cl->specific_nodes)
+			node = STARPU_CODELET_GET_NODE(j->task->cl, i);
+		_STARPU_JOB_SET_ORDERED_BUFFER_NODE(j, node, i);
 	}
 	}
 
 
 	_starpu_sort_task_handles(_STARPU_JOB_GET_ORDERED_BUFFERS(j), cl->nbuffers);
 	_starpu_sort_task_handles(_STARPU_JOB_GET_ORDERED_BUFFERS(j), cl->nbuffers);

+ 4 - 1
src/core/jobs.h

@@ -56,6 +56,7 @@ typedef void (*_starpu_cl_func_t)(void **, void *);
 struct _starpu_data_descr {
 struct _starpu_data_descr {
 	starpu_data_handle_t handle;
 	starpu_data_handle_t handle;
 	enum starpu_data_access_mode mode;
 	enum starpu_data_access_mode mode;
+	int node;
 };
 };
 
 
 /* A job is the internal representation of a task. */
 /* A job is the internal representation of a task. */
@@ -180,9 +181,11 @@ int _starpu_push_local_task(struct _starpu_worker *worker, struct starpu_task *t
 
 
 #define _STARPU_JOB_GET_ORDERED_BUFFER_HANDLE(job, i) ((job->dyn_ordered_buffers) ? job->dyn_ordered_buffers[i].handle : job->ordered_buffers[i].handle)
 #define _STARPU_JOB_GET_ORDERED_BUFFER_HANDLE(job, i) ((job->dyn_ordered_buffers) ? job->dyn_ordered_buffers[i].handle : job->ordered_buffers[i].handle)
 #define _STARPU_JOB_GET_ORDERED_BUFFER_MODE(job, i) ((job->dyn_ordered_buffers) ? job->dyn_ordered_buffers[i].mode : job->ordered_buffers[i].mode)
 #define _STARPU_JOB_GET_ORDERED_BUFFER_MODE(job, i) ((job->dyn_ordered_buffers) ? job->dyn_ordered_buffers[i].mode : job->ordered_buffers[i].mode)
+#define _STARPU_JOB_GET_ORDERED_BUFFER_NODE(job, i) ((job->dyn_ordered_buffers) ? job->dyn_ordered_buffers[i].node : job->ordered_buffers[i].node)
 
 
 #define _STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(job, handle, i) do { if (job->dyn_ordered_buffers) job->dyn_ordered_buffers[i].handle = (handle); else job->ordered_buffers[i].handle = (handle);} while(0)
 #define _STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(job, handle, i) do { if (job->dyn_ordered_buffers) job->dyn_ordered_buffers[i].handle = (handle); else job->ordered_buffers[i].handle = (handle);} while(0)
-#define _STARPU_JOB_SET_ORDERED_BUFFER_MODE(job, mode, i) do { if (job->dyn_ordered_buffers) job->dyn_ordered_buffers[i].mode = mode; else job->ordered_buffers[i].mode = mode;} while(0)
+#define _STARPU_JOB_SET_ORDERED_BUFFER_MODE(job, __mode, i) do { if (job->dyn_ordered_buffers) job->dyn_ordered_buffers[i].mode = __mode; else job->ordered_buffers[i].mode = __mode;} while(0)
+#define _STARPU_JOB_SET_ORDERED_BUFFER_NODE(job, __node, i) do { if (job->dyn_ordered_buffers) job->dyn_ordered_buffers[i].node = __node; else job->ordered_buffers[i].node = __node;} while(0)
 
 
 #define _STARPU_JOB_SET_ORDERED_BUFFER(job, buffer, i) do { if (job->dyn_ordered_buffers) job->dyn_ordered_buffers[i] = buffer; else job->ordered_buffers[i] = buffer;} while(0)
 #define _STARPU_JOB_SET_ORDERED_BUFFER(job, buffer, i) do { if (job->dyn_ordered_buffers) job->dyn_ordered_buffers[i] = buffer; else job->ordered_buffers[i] = buffer;} while(0)
 #define _STARPU_JOB_GET_ORDERED_BUFFERS(job) (job->dyn_ordered_buffers) ? job->dyn_ordered_buffers : job->ordered_buffers
 #define _STARPU_JOB_GET_ORDERED_BUFFERS(job) (job->dyn_ordered_buffers) ? job->dyn_ordered_buffers : job->ordered_buffers

+ 9 - 1
src/core/task.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  INRIA
  * Copyright (C) 2011  INRIA
@@ -599,6 +599,10 @@ int _starpu_task_submit_nodeps(struct starpu_task *task)
 			_STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(j, handle, i);
 			_STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(j, handle, i);
 			enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(j->task->cl, i);
 			enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(j->task->cl, i);
 			_STARPU_JOB_SET_ORDERED_BUFFER_MODE(j, mode, i);
 			_STARPU_JOB_SET_ORDERED_BUFFER_MODE(j, mode, i);
+			int node = -1;
+			if (j->task->cl->specific_nodes)
+				node = STARPU_CODELET_GET_NODE(j->task->cl, i);
+			_STARPU_JOB_SET_ORDERED_BUFFER_NODE(j, node, i);
 		}
 		}
 	}
 	}
 
 
@@ -659,6 +663,10 @@ int _starpu_task_submit_conversion_task(struct starpu_task *task,
 		_STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(j, handle, i);
 		_STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(j, handle, i);
 		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(j->task->cl, i);
 		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(j->task->cl, i);
 		_STARPU_JOB_SET_ORDERED_BUFFER_MODE(j, mode, i);
 		_STARPU_JOB_SET_ORDERED_BUFFER_MODE(j, mode, i);
+		int node = -1;
+		if (j->task->cl->specific_nodes)
+			node = STARPU_CODELET_GET_NODE(j->task->cl, i);
+		_STARPU_JOB_SET_ORDERED_BUFFER_NODE(j, node, i);
 	}
 	}
 
 
         _STARPU_LOG_IN();
         _STARPU_LOG_IN();

+ 18 - 6
src/datawizard/coherency.c

@@ -678,13 +678,13 @@ int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node)
 	return 0;
 	return 0;
 }
 }
 
 
-static struct _starpu_data_replicate *get_replicate(starpu_data_handle_t handle, enum starpu_data_access_mode mode, int workerid, unsigned local_memory_node)
+static struct _starpu_data_replicate *get_replicate(starpu_data_handle_t handle, enum starpu_data_access_mode mode, int workerid, unsigned node)
 {
 {
 	if (mode & (STARPU_SCRATCH|STARPU_REDUX))
 	if (mode & (STARPU_SCRATCH|STARPU_REDUX))
 		return &handle->per_worker[workerid];
 		return &handle->per_worker[workerid];
 	else
 	else
 		/* That's a "normal" buffer (R/W) */
 		/* That's a "normal" buffer (R/W) */
-		return &handle->per_node[local_memory_node];
+		return &handle->per_node[node];
 }
 }
 
 
 int _starpu_fetch_task_input(struct _starpu_job *j, uint32_t mask)
 int _starpu_fetch_task_input(struct _starpu_job *j, uint32_t mask)
@@ -713,6 +713,9 @@ int _starpu_fetch_task_input(struct _starpu_job *j, uint32_t mask)
 		int ret;
 		int ret;
 		starpu_data_handle_t handle = descrs[index].handle;
 		starpu_data_handle_t handle = descrs[index].handle;
 		enum starpu_data_access_mode mode = descrs[index].mode;
 		enum starpu_data_access_mode mode = descrs[index].mode;
+		int node = descrs[index].node;
+		if (node == -1)
+			node = local_memory_node;
 
 
 		struct _starpu_data_replicate *local_replicate;
 		struct _starpu_data_replicate *local_replicate;
 
 
@@ -722,7 +725,7 @@ int _starpu_fetch_task_input(struct _starpu_job *j, uint32_t mask)
 			 * _starpu_compar_handles */
 			 * _starpu_compar_handles */
 			continue;
 			continue;
 
 
-		local_replicate = get_replicate(handle, mode, workerid, local_memory_node);
+		local_replicate = get_replicate(handle, mode, workerid, node);
 
 
 		ret = fetch_data(handle, local_replicate, mode);
 		ret = fetch_data(handle, local_replicate, mode);
 		if (STARPU_UNLIKELY(ret))
 		if (STARPU_UNLIKELY(ret))
@@ -741,10 +744,13 @@ int _starpu_fetch_task_input(struct _starpu_job *j, uint32_t mask)
 	{
 	{
 		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, index);
 		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, index);
 		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(task->cl, index);
 		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(task->cl, index);
+		/* handle and mode above are read in task order (index is a position
+		 * in the task's buffer list), so the target node is read in task
+		 * order too, straight from the codelet.
+		 * NOTE(review): descrs[] appears to be the sorted ordered-buffers
+		 * array (cf. the _starpu_compar_handles comments in this function),
+		 * in which case descrs[index] may describe a different buffer than
+		 * task handle #index -- confirm. */
+		int node = -1;
+		if (task->cl->specific_nodes)
+			node = STARPU_CODELET_GET_NODE(task->cl, index);
+		if (node == -1)
+			node = local_memory_node;
 
 
 		struct _starpu_data_replicate *local_replicate;
 		struct _starpu_data_replicate *local_replicate;
 
 
-		local_replicate = get_replicate(handle, mode, workerid, local_memory_node);
+		local_replicate = get_replicate(handle, mode, workerid, node);
 
 
 		_STARPU_TASK_SET_INTERFACE(task , local_replicate->data_interface, index);
 		_STARPU_TASK_SET_INTERFACE(task , local_replicate->data_interface, index);
 
 
@@ -773,6 +779,9 @@ enomem:
 	{
 	{
 		starpu_data_handle_t handle = descrs[index2].handle;
 		starpu_data_handle_t handle = descrs[index2].handle;
 		enum starpu_data_access_mode mode = descrs[index2].mode;
 		enum starpu_data_access_mode mode = descrs[index2].mode;
+		/* Read the node from the same descriptor as handle and mode
+		 * (index2), so that the replicate released below is the one that
+		 * belongs to this buffer. */
+		int node = descrs[index2].node;
+		if (node == -1)
+			node = local_memory_node;
 
 
 		struct _starpu_data_replicate *local_replicate;
 		struct _starpu_data_replicate *local_replicate;
 
 
@@ -782,7 +791,7 @@ enomem:
 			 * _starpu_compar_handles */
 			 * _starpu_compar_handles */
 			continue;
 			continue;
 
 
-		local_replicate = get_replicate(handle, mode, workerid, local_memory_node);
+		local_replicate = get_replicate(handle, mode, workerid, node);
 
 
 		_starpu_release_data_on_node(handle, mask, local_replicate);
 		_starpu_release_data_on_node(handle, mask, local_replicate);
 	}
 	}
@@ -810,6 +819,9 @@ void _starpu_push_task_output(struct _starpu_job *j, uint32_t mask)
 	{
 	{
 		starpu_data_handle_t handle = descrs[index].handle;
 		starpu_data_handle_t handle = descrs[index].handle;
 		enum starpu_data_access_mode mode = descrs[index].mode;
 		enum starpu_data_access_mode mode = descrs[index].mode;
+		int node = descrs[index].node;
+		if (node == -1)
+			node = local_memory_node;
 
 
 		struct _starpu_data_replicate *local_replicate;
 		struct _starpu_data_replicate *local_replicate;
 
 
@@ -819,7 +831,7 @@ void _starpu_push_task_output(struct _starpu_job *j, uint32_t mask)
 			 * _starpu_compar_handles */
 			 * _starpu_compar_handles */
 			continue;
 			continue;
 
 
-		local_replicate = get_replicate(handle, mode, workerid, local_memory_node);
+		local_replicate = get_replicate(handle, mode, workerid, node);
 
 
 		/* Keep a reference for future
 		/* Keep a reference for future
 		 * _starpu_release_task_enforce_sequential_consistency call */
 		 * _starpu_release_task_enforce_sequential_consistency call */

+ 12 - 0
tests/Makefile.am

@@ -200,6 +200,7 @@ noinst_PROGRAMS =				\
 	datawizard/wt_host			\
 	datawizard/wt_host			\
 	datawizard/wt_broadcast			\
 	datawizard/wt_broadcast			\
 	datawizard/readonly			\
 	datawizard/readonly			\
+	datawizard/specific_node		\
 	disk/disk_copy				\
 	disk/disk_copy				\
 	disk/disk_compute			\
 	disk/disk_compute			\
 	errorcheck/starpu_init_noworker		\
 	errorcheck/starpu_init_noworker		\
@@ -384,6 +385,17 @@ datawizard_wt_host_SOURCES =			\
 datawizard_wt_broadcast_SOURCES =		\
 datawizard_wt_broadcast_SOURCES =		\
 	datawizard/wt_broadcast.c
 	datawizard/wt_broadcast.c
 
 
+datawizard_specific_node_SOURCES =		\
+	datawizard/specific_node.c
+if STARPU_USE_CUDA
+datawizard_specific_node_SOURCES +=		\
+	datawizard/cuda_codelet_unsigned_inc.cu
+endif
+if STARPU_USE_OPENCL
+datawizard_specific_node_SOURCES +=			\
+	datawizard/opencl_codelet_unsigned_inc.c
+endif
+
 main_deprecated_func_CFLAGS = $(AM_CFLAGS) -Wno-deprecated-declarations
 main_deprecated_func_CFLAGS = $(AM_CFLAGS) -Wno-deprecated-declarations
 main_deprecated_buffer_CFLAGS = $(AM_CFLAGS) -Wno-deprecated-declarations
 main_deprecated_buffer_CFLAGS = $(AM_CFLAGS) -Wno-deprecated-declarations
 
 

+ 134 - 0
tests/datawizard/specific_node.c

@@ -0,0 +1,134 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2014  Université de Bordeaux 1
+ * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+#include <starpu.h>
+#include <stdlib.h>
+#include "../helper.h"
+
+starpu_data_handle_t data_handle;
+
+unsigned data;
+
+/* Test kernel: asserts that the buffer it receives is the "data" variable
+ * itself (i.e. the pointer handed to the kernel is the address of the
+ * registered variable, not a copy), then increments that variable by one. */
+void specific_kernel(STARPU_ATTRIBUTE_UNUSED void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	/* We do not protect this variable because it is only accessed when the
+	 * "data_handle" piece of data is accessed. */
+	unsigned *dataptr = (unsigned*) STARPU_VARIABLE_GET_PTR(descr[0]);
+
+	STARPU_ASSERT(dataptr == &data);
+	/* Parentheses are required: "*dataptr++" would advance the pointer and
+	 * leave the pointed-to value untouched. */
+	(*dataptr)++;
+}
+
+/* Codelet with one RW buffer whose target node is explicitly set to
+ * STARPU_MAIN_RAM; the CPU, CUDA and OpenCL entries all point to the same
+ * specific_kernel function. */
+static struct starpu_codelet specific_cl =
+{
+	/* Data description: a single buffer, forced to main memory. */
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+	.specific_nodes = 1,
+	.nodes = {STARPU_MAIN_RAM},
+
+	/* Implementations. */
+	.opencl_funcs = {specific_kernel, NULL},
+	.cuda_funcs = {specific_kernel, NULL},
+	.cpu_funcs = {specific_kernel, NULL},
+};
+
+#ifdef STARPU_USE_CUDA
+void cuda_codelet_unsigned_inc(void *descr[], STARPU_ATTRIBUTE_UNUSED void *cl_arg);
+#endif
+#ifdef STARPU_USE_OPENCL
+void opencl_codelet_unsigned_inc(void *buffers[], void *args);
+#endif
+
+/* Regular codelet with one RW buffer and no explicit target node: the CPU
+ * entry is specific_kernel, while the CUDA and OpenCL entries (when built)
+ * are the dedicated unsigned-increment kernels. */
+static struct starpu_codelet cl =
+{
+	/* Data description: a single buffer, placed by StarPU as usual. */
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+
+	/* Implementations. */
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {opencl_codelet_unsigned_inc, NULL},
+#endif
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {cuda_codelet_unsigned_inc, NULL},
+#endif
+	.cpu_funcs = {specific_kernel, NULL},
+};
+
+#ifdef STARPU_USE_OPENCL
+struct starpu_opencl_program opencl_program;
+#endif
+
+/* Submits ntasks tasks that each increment "data", alternating between the
+ * codelet that pins the buffer to main memory (specific_cl) and the regular
+ * one (cl), then checks that the final value equals the number of tasks.
+ * Returns EXIT_SUCCESS / EXIT_FAILURE accordingly, or STARPU_TEST_SKIPPED when
+ * StarPU reports -ENODEV at initialization or submission time. */
+int main(STARPU_ATTRIBUTE_UNUSED int argc, STARPU_ATTRIBUTE_UNUSED char **argv)
+{
+#ifdef STARPU_QUICK_CHECK
+	int ntasks = 10;
+#else
+	int ntasks = 1000;
+#endif
+
+	int ret;
+	/* Verdict of the test, kept apart from "ret" so that later StarPU calls
+	 * cannot overwrite it. */
+	int result;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_USE_OPENCL
+	ret = starpu_opencl_load_opencl_from_file("tests/datawizard/opencl_codelet_unsigned_inc_kernel.cl",
+						  &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+#endif
+
+	data = 0;
+
+	/* Register the counter incremented by the tasks, with main memory as
+	 * its home node. */
+	starpu_variable_data_register(&data_handle, STARPU_MAIN_RAM, (uintptr_t) &data, sizeof(data));
+
+	int i;
+	for (i = 0; i < ntasks; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+		/* Odd iterations use the codelet with an explicit target node,
+		 * even iterations use the regular one. */
+		if (i%2)
+			task->cl = &specific_cl;
+		else
+			task->cl = &cl;
+		task->handles[0] = data_handle;
+
+		ret = starpu_task_submit(task);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	/* There is no explicit wait for the tasks: starpu_data_unregister() is
+	 * relied upon to wait for the pending accesses and to bring the final
+	 * value back into "data" (see the StarPU data management API). */
+	starpu_data_unregister(data_handle);
+
+	result = (data == (unsigned) ntasks) ? EXIT_SUCCESS : EXIT_FAILURE;
+
+#ifdef STARPU_USE_OPENCL
+	ret = starpu_opencl_unload_opencl(&opencl_program);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
+#endif
+
+	starpu_shutdown();
+
+	return result;
+
+enodev:
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
+}