Explorar o código

* New codelet specific_nodes field to specify explicit target nodes for data.

Samuel Thibault hai 11 anos
pai
achega
fedd296373

+ 1 - 0
ChangeLog

@@ -42,6 +42,7 @@ New features:
     sequential consistency defined for the given data)
   * New functions starpu_mpi_task_build() and starpu_mpi_task_post_build()
   * New functions starpu_pause() and starpu_resume()
+  * New codelet specific_nodes field to specify explicit target nodes for data.
 
 Small features:
   * New functions starpu_data_acquire_cb_sequential_consistency() and

+ 25 - 0
doc/doxygen/chapters/07data_management.doxy

@@ -504,5 +504,30 @@ The whole code for this complex data interface is available in the
 directory <c>examples/interface/</c>.
 
 
+\section SpecifyingATargetNode Specifying a target node for task data
+
+When executing a task on a GPU, for instance, StarPU would normally copy all the
+data needed by the task to the embedded memory of the GPU.  It may however
+happen that the task kernel would rather have some of the data kept in
+main memory instead of copied to the GPU, a pivoting vector for instance.
+This can be achieved by setting the starpu_codelet::specific_nodes flag to
+1, and then filling the starpu_codelet::nodes array (or starpu_codelet::dyn_nodes
+when starpu_codelet::nbuffers is greater than STARPU_NMAXBUFS) with the node
+numbers where data should be copied to, or -1 to let StarPU copy it to the memory
+node where the task will be executed. For instance, with the following codelet:
+
+\code{.c}
+struct starpu_codelet cl =
+{
+	.cuda_funcs = { kernel, NULL },
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_RW},
+	.specific_nodes = 1,
+	.nodes = {STARPU_MAIN_RAM, -1},
+};
+\endcode
+
+the first data of the task will be kept in the main memory, while the second
+data will be copied to the CUDA GPU as usual.
 
 */

+ 24 - 1
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -278,13 +278,36 @@ unsufficient, this value can be set with the configure option
 
 \var starpu_codelet::dyn_modes
 Is an array of ::starpu_data_access_mode. It describes the required
-access modes to the data neeeded by the codelet (e.g. ::STARPU_RW).
+access modes to the data needed by the codelet (e.g. ::STARPU_RW).
 The number of entries in this array must be specified in the field
 starpu_codelet::nbuffers. This field should be used for codelets having a
 number of datas greater than \ref STARPU_NMAXBUFS (see \ref
 SettingTheDataHandlesForATask). When defining a codelet, one
 should either define this field or the field starpu_codelet::modes defined above.
 
+\var starpu_codelet::specific_nodes
+Default value is 0. If this flag is set, StarPU will not systematically
+send all data to the memory node where the task will be executing; instead it
+will read the starpu_codelet::nodes or starpu_codelet::dyn_nodes array to
+determine, for each data, whether to send it to the memory node where the task
+will be executing (-1), or to a specific node (!= -1).
+
+\var starpu_codelet::nodes
+Optional field. When starpu_codelet::specific_nodes is 1, this specifies
+the memory nodes where each data should be sent to for task execution.
+The number of entries in this array is starpu_codelet::nbuffers, and should
+not exceed \ref STARPU_NMAXBUFS.
+
+\var starpu_codelet::dyn_nodes
+Optional field. When starpu_codelet::specific_nodes is 1, this specifies
+the memory nodes where each data should be sent to for task execution.
+The number of entries in this array is starpu_codelet::nbuffers.
+This field should be used for codelets having a
+number of datas greater than \ref STARPU_NMAXBUFS (see \ref
+SettingTheDataHandlesForATask). When defining a codelet, one
+should either define this field or the field starpu_codelet::nodes defined
+above.
+
 \var starpu_codelet::model
 Optional pointer to the task duration performance model associated to
 this codelet. This optional field is ignored when set to <c>NULL</c> or when

+ 8 - 1
include/starpu_task.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  INRIA
@@ -100,6 +100,10 @@ struct starpu_codelet
 	enum starpu_data_access_mode modes[STARPU_NMAXBUFS];
 	enum starpu_data_access_mode *dyn_modes;
 
+	unsigned specific_nodes;
+	int nodes[STARPU_NMAXBUFS];
+	int *dyn_nodes;
+
 	struct starpu_perfmodel *model;
 	struct starpu_perfmodel *power_model;
 
@@ -209,6 +213,9 @@ struct starpu_task
 #define STARPU_CODELET_GET_MODE(codelet, i) ((codelet->dyn_modes) ? codelet->dyn_modes[i] : codelet->modes[i])
 #define STARPU_CODELET_SET_MODE(codelet, mode, i) do { if (codelet->dyn_modes) codelet->dyn_modes[i] = mode; else codelet->modes[i] = mode; } while(0)
 
+#define STARPU_CODELET_GET_NODE(codelet, i) (((codelet)->dyn_nodes) ? (codelet)->dyn_nodes[i] : (codelet)->nodes[i]) /* node of buffer i: dyn_nodes when set, else nodes */
+#define STARPU_CODELET_SET_NODE(codelet, __node, i) do { if ((codelet)->dyn_nodes) (codelet)->dyn_nodes[i] = (__node); else (codelet)->nodes[i] = (__node); } while(0) /* stores into whichever array GET_NODE reads */
+
 void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...);
 void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t *array);
 

+ 5 - 1
src/core/dependencies/data_concurrency.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -254,6 +254,10 @@ unsigned _starpu_submit_job_enforce_data_deps(struct _starpu_job *j)
 		_STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(j, handle, i);
 		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(j->task->cl, i);
 		_STARPU_JOB_SET_ORDERED_BUFFER_MODE(j, mode, i);
+		int node = -1;
+		if (j->task->cl->specific_nodes)
+			node = STARPU_CODELET_GET_NODE(j->task->cl, i);
+		_STARPU_JOB_SET_ORDERED_BUFFER_NODE(j, node, i);
 	}
 
 	_starpu_sort_task_handles(_STARPU_JOB_GET_ORDERED_BUFFERS(j), cl->nbuffers);

+ 4 - 1
src/core/jobs.h

@@ -56,6 +56,7 @@ typedef void (*_starpu_cl_func_t)(void **, void *);
 struct _starpu_data_descr {
 	starpu_data_handle_t handle;
 	enum starpu_data_access_mode mode;
+	int node;
 };
 
 /* A job is the internal representation of a task. */
@@ -180,9 +181,11 @@ int _starpu_push_local_task(struct _starpu_worker *worker, struct starpu_task *t
 
 #define _STARPU_JOB_GET_ORDERED_BUFFER_HANDLE(job, i) ((job->dyn_ordered_buffers) ? job->dyn_ordered_buffers[i].handle : job->ordered_buffers[i].handle)
 #define _STARPU_JOB_GET_ORDERED_BUFFER_MODE(job, i) ((job->dyn_ordered_buffers) ? job->dyn_ordered_buffers[i].mode : job->ordered_buffers[i].mode)
+#define _STARPU_JOB_GET_ORDERED_BUFFER_NODE(job, i) (((job)->dyn_ordered_buffers) ? (job)->dyn_ordered_buffers[i].node : (job)->ordered_buffers[i].node) /* node recorded for ordered buffer i */
 
 #define _STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(job, handle, i) do { if (job->dyn_ordered_buffers) job->dyn_ordered_buffers[i].handle = (handle); else job->ordered_buffers[i].handle = (handle);} while(0)
-#define _STARPU_JOB_SET_ORDERED_BUFFER_MODE(job, mode, i) do { if (job->dyn_ordered_buffers) job->dyn_ordered_buffers[i].mode = mode; else job->ordered_buffers[i].mode = mode;} while(0)
+#define _STARPU_JOB_SET_ORDERED_BUFFER_MODE(job, __mode, i) do { if ((job)->dyn_ordered_buffers) (job)->dyn_ordered_buffers[i].mode = (__mode); else (job)->ordered_buffers[i].mode = (__mode);} while(0) /* arguments parenthesized like the HANDLE setter */
+#define _STARPU_JOB_SET_ORDERED_BUFFER_NODE(job, __node, i) do { if ((job)->dyn_ordered_buffers) (job)->dyn_ordered_buffers[i].node = (__node); else (job)->ordered_buffers[i].node = (__node);} while(0) /* records the target node of ordered buffer i */
 
 #define _STARPU_JOB_SET_ORDERED_BUFFER(job, buffer, i) do { if (job->dyn_ordered_buffers) job->dyn_ordered_buffers[i] = buffer; else job->ordered_buffers[i] = buffer;} while(0)
 #define _STARPU_JOB_GET_ORDERED_BUFFERS(job) (job->dyn_ordered_buffers) ? job->dyn_ordered_buffers : job->ordered_buffers

+ 9 - 1
src/core/task.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  INRIA
@@ -599,6 +599,10 @@ int _starpu_task_submit_nodeps(struct starpu_task *task)
 			_STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(j, handle, i);
 			enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(j->task->cl, i);
 			_STARPU_JOB_SET_ORDERED_BUFFER_MODE(j, mode, i);
+			int node = -1;
+			if (j->task->cl->specific_nodes)
+				node = STARPU_CODELET_GET_NODE(j->task->cl, i);
+			_STARPU_JOB_SET_ORDERED_BUFFER_NODE(j, node, i);
 		}
 	}
 
@@ -659,6 +663,10 @@ int _starpu_task_submit_conversion_task(struct starpu_task *task,
 		_STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(j, handle, i);
 		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(j->task->cl, i);
 		_STARPU_JOB_SET_ORDERED_BUFFER_MODE(j, mode, i);
+		int node = -1;
+		if (j->task->cl->specific_nodes)
+			node = STARPU_CODELET_GET_NODE(j->task->cl, i);
+		_STARPU_JOB_SET_ORDERED_BUFFER_NODE(j, node, i);
 	}
 
         _STARPU_LOG_IN();

+ 18 - 6
src/datawizard/coherency.c

@@ -678,13 +678,13 @@ int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node)
 	return 0;
 }
 
-static struct _starpu_data_replicate *get_replicate(starpu_data_handle_t handle, enum starpu_data_access_mode mode, int workerid, unsigned local_memory_node)
+static struct _starpu_data_replicate *get_replicate(starpu_data_handle_t handle, enum starpu_data_access_mode mode, int workerid, unsigned node)
 {
 	if (mode & (STARPU_SCRATCH|STARPU_REDUX))
 		return &handle->per_worker[workerid];
 	else
 		/* That's a "normal" buffer (R/W) */
-		return &handle->per_node[local_memory_node];
+		return &handle->per_node[node];
 }
 
 int _starpu_fetch_task_input(struct _starpu_job *j, uint32_t mask)
@@ -713,6 +713,9 @@ int _starpu_fetch_task_input(struct _starpu_job *j, uint32_t mask)
 		int ret;
 		starpu_data_handle_t handle = descrs[index].handle;
 		enum starpu_data_access_mode mode = descrs[index].mode;
+		int node = descrs[index].node;
+		if (node == -1)
+			node = local_memory_node;
 
 		struct _starpu_data_replicate *local_replicate;
 
@@ -722,7 +725,7 @@ int _starpu_fetch_task_input(struct _starpu_job *j, uint32_t mask)
 			 * _starpu_compar_handles */
 			continue;
 
-		local_replicate = get_replicate(handle, mode, workerid, local_memory_node);
+		local_replicate = get_replicate(handle, mode, workerid, node);
 
 		ret = fetch_data(handle, local_replicate, mode);
 		if (STARPU_UNLIKELY(ret))
@@ -741,10 +744,13 @@ int _starpu_fetch_task_input(struct _starpu_job *j, uint32_t mask)
 	{
 		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, index);
 		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(task->cl, index);
+		int node = task->cl->specific_nodes ? STARPU_CODELET_GET_NODE(task->cl, index) : -1; /* index follows task handles here, not descrs[] order */
+		if (node == -1)
+			node = local_memory_node;
 
 		struct _starpu_data_replicate *local_replicate;
 
-		local_replicate = get_replicate(handle, mode, workerid, local_memory_node);
+		local_replicate = get_replicate(handle, mode, workerid, node);
 
 		_STARPU_TASK_SET_INTERFACE(task , local_replicate->data_interface, index);
 
@@ -773,6 +779,9 @@ enomem:
 	{
 		starpu_data_handle_t handle = descrs[index2].handle;
 		enum starpu_data_access_mode mode = descrs[index2].mode;
+		int node = descrs[index2].node; /* same entry as the handle and mode read just above */
+		if (node == -1)
+			node = local_memory_node;
 
 		struct _starpu_data_replicate *local_replicate;
 
@@ -782,7 +791,7 @@ enomem:
 			 * _starpu_compar_handles */
 			continue;
 
-		local_replicate = get_replicate(handle, mode, workerid, local_memory_node);
+		local_replicate = get_replicate(handle, mode, workerid, node);
 
 		_starpu_release_data_on_node(handle, mask, local_replicate);
 	}
@@ -810,6 +819,9 @@ void _starpu_push_task_output(struct _starpu_job *j, uint32_t mask)
 	{
 		starpu_data_handle_t handle = descrs[index].handle;
 		enum starpu_data_access_mode mode = descrs[index].mode;
+		int node = descrs[index].node;
+		if (node == -1)
+			node = local_memory_node;
 
 		struct _starpu_data_replicate *local_replicate;
 
@@ -819,7 +831,7 @@ void _starpu_push_task_output(struct _starpu_job *j, uint32_t mask)
 			 * _starpu_compar_handles */
 			continue;
 
-		local_replicate = get_replicate(handle, mode, workerid, local_memory_node);
+		local_replicate = get_replicate(handle, mode, workerid, node);
 
 		/* Keep a reference for future
 		 * _starpu_release_task_enforce_sequential_consistency call */

+ 12 - 0
tests/Makefile.am

@@ -200,6 +200,7 @@ noinst_PROGRAMS =				\
 	datawizard/wt_host			\
 	datawizard/wt_broadcast			\
 	datawizard/readonly			\
+	datawizard/specific_node		\
 	disk/disk_copy				\
 	disk/disk_compute			\
 	errorcheck/starpu_init_noworker		\
@@ -384,6 +385,17 @@ datawizard_wt_host_SOURCES =			\
 datawizard_wt_broadcast_SOURCES =		\
 	datawizard/wt_broadcast.c
 
+datawizard_specific_node_SOURCES =		\
+	datawizard/specific_node.c
+if STARPU_USE_CUDA
+datawizard_specific_node_SOURCES +=		\
+	datawizard/cuda_codelet_unsigned_inc.cu
+endif
+if STARPU_USE_OPENCL
+datawizard_specific_node_SOURCES +=			\
+	datawizard/opencl_codelet_unsigned_inc.c
+endif
+
 main_deprecated_func_CFLAGS = $(AM_CFLAGS) -Wno-deprecated-declarations
 main_deprecated_buffer_CFLAGS = $(AM_CFLAGS) -Wno-deprecated-declarations
 

+ 134 - 0
tests/datawizard/specific_node.c

@@ -0,0 +1,134 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2014  Université de Bordeaux 1
+ * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+#include <starpu.h>
+#include <stdlib.h>
+#include "../helper.h"
+
+starpu_data_handle_t data_handle;	/* handle registered on "data" in main() */
+
+unsigned data;	/* counter incremented by the kernels and compared with ntasks at the end */
+
+void specific_kernel(STARPU_ATTRIBUTE_UNUSED void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	/* Increment the variable behind descr[0]. No extra protection is used:
+	 * it is only touched while the "data_handle" piece of data is accessed. */
+	unsigned *dataptr = (unsigned*) STARPU_VARIABLE_GET_PTR(descr[0]);
+
+	STARPU_ASSERT(dataptr == &data);	/* we must get the original variable, not a copy */
+	(*dataptr)++;	/* increment the value, not the pointer */
+}
+
+static struct starpu_codelet specific_cl =
+{
+	.cpu_funcs = {specific_kernel, NULL},
+	.cuda_funcs = {specific_kernel, NULL},	/* same host-side function as the CPU implementation */
+	.opencl_funcs = {specific_kernel, NULL},	/* same host-side function as the CPU implementation */
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+	.specific_nodes = 1,	/* make StarPU read .nodes to pick the target memory node */
+	.nodes = {STARPU_MAIN_RAM},	/* buffer 0 is requested in main memory */
+};
+
+#ifdef STARPU_USE_CUDA
+void cuda_codelet_unsigned_inc(void *descr[], STARPU_ATTRIBUTE_UNUSED void *cl_arg);
+#endif
+#ifdef STARPU_USE_OPENCL
+void opencl_codelet_unsigned_inc(void *buffers[], void *args);
+#endif
+
+static struct starpu_codelet cl =
+{
+	.cpu_funcs = {specific_kernel, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {cuda_codelet_unsigned_inc, NULL},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {opencl_codelet_unsigned_inc, NULL},
+#endif
+	.nbuffers = 1,
+	.modes = {STARPU_RW},	/* specific_nodes is left unset (0) for this codelet */
+};
+
+#ifdef STARPU_USE_OPENCL
+struct starpu_opencl_program opencl_program;
+#endif
+/* Submit ntasks increments of "data", alternating cl and specific_cl, then check the final count. */
+int main(STARPU_ATTRIBUTE_UNUSED int argc, STARPU_ATTRIBUTE_UNUSED char **argv)
+{
+#ifdef STARPU_QUICK_CHECK
+	int ntasks = 10;
+#else
+	int ntasks = 1000;
+#endif
+
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_USE_OPENCL
+	ret = starpu_opencl_load_opencl_from_file("tests/datawizard/opencl_codelet_unsigned_inc_kernel.cl",
+						  &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+#endif
+
+	data = 0;
+
+	/* Register the counter as a variable whose home node is main RAM. */
+	starpu_variable_data_register(&data_handle, STARPU_MAIN_RAM, (uintptr_t) &data, sizeof(data));
+
+	int i;
+	for (i = 0; i < ntasks; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+		if (i%2)
+			task->cl = &specific_cl;	/* odd tasks: buffer requested in main RAM */
+		else
+			task->cl = &cl;	/* even tasks: no specific node requested */
+		task->handles[0] = data_handle;
+
+		ret = starpu_task_submit(task);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	starpu_data_unregister(data_handle);
+
+#ifdef STARPU_USE_OPENCL
+	ret = starpu_opencl_unload_opencl(&opencl_program);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
+#endif
+
+	ret = (data == (unsigned) ntasks) ? EXIT_SUCCESS : EXIT_FAILURE;	/* set last so the verdict is what we return */
+
+	starpu_shutdown();
+
+	return ret;
+
+enodev:
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
+}