Sfoglia il codice sorgente

Allow a task to use a dynamically allocated number of buffers, thereby
overriding the limit set with --enable-maxbuffers=XXX

Nathalie Furmento 12 anni fa
parent
commit
097a832e3c

+ 2 - 0
ChangeLog

@@ -111,6 +111,8 @@ New features:
     pthread API. It is provided with 2 implementations: a pthread one
     and a Simgrid one. Applications using StarPU and wishing to use
     the Simgrid StarPU features should use it.
+  * Allow a task to use a dynamically allocated number of buffers,
+    thereby overriding the limit set with --enable-maxbuffers=XXX
 
 Small features:
   * Add starpu_worker_get_by_type and starpu_worker_get_by_devid

+ 54 - 0
doc/chapters/advanced-examples.texi

@@ -23,6 +23,7 @@
 * Defining a New Scheduling Policy::
 * On-GPU rendering::
 * Defining a New Data Interface::
+* Setting the Data Handles for a Task::
 * More examples::               More examples shipped with StarPU
 @end menu
 
@@ -1264,6 +1265,59 @@ void display_complex_codelet(void *descr[], __attribute__ ((unused)) void *_args
 
 The whole code for this complex data interface is available in the
 directory @code{examples/interface/}.
+
+@node Setting the Data Handles for a Task
+@section Setting the Data Handles for a Task
+
+The number of data handles a task can manage is bounded by the macro
+@code{STARPU_NMAXBUFS}, whose default value can be changed through
+the configure option @code{--enable-maxbuffers} (see
+@ref{--enable-maxbuffers}).
+
+However, it is possible to define tasks managing more data by using
+the field @code{dyn_handles} when defining a task and the field
+@code{dyn_modes} when defining the corresponding codelet.
+
+@c TODO: update the documentation of starpu_task and starpu_codelet
+
+@cartouche
+@smallexample
+struct starpu_codelet dummy_big_cl =
+@{
+	.cuda_funcs = @{dummy_big_kernel, NULL@},
+	.opencl_funcs = @{dummy_big_kernel, NULL@},
+	.cpu_funcs = @{dummy_big_kernel, NULL@},
+	.nbuffers = STARPU_NMAXBUFS+1
+@};
+
+task = starpu_task_create();
+task->cl = &dummy_big_cl;
+task->dyn_handles = malloc(task->cl->nbuffers * sizeof(starpu_data_handle_t));
+for(i=0 ; i<task->cl->nbuffers ; i++)
+@{
+	task->dyn_handles[i] = handle;
+@}
+starpu_task_submit(task);
+@end smallexample
+@end cartouche
+
+@cartouche
+@smallexample
+starpu_data_handle_t *handles = malloc(dummy_big_cl.nbuffers * sizeof(starpu_data_handle_t));
+for(i=0 ; i<dummy_big_cl.nbuffers ; i++)
+@{
+	handles[i] = handle;
+@}
+starpu_insert_task(&dummy_big_cl,
+        	 STARPU_VALUE, &dummy_big_cl.nbuffers, sizeof(dummy_big_cl.nbuffers),
+		 STARPU_DATA_ARRAY, handles, dummy_big_cl.nbuffers,
+		 0);
+@end smallexample
+@end cartouche
+
+The whole code for this example is available in the file
+@code{examples/basic_examples/dynamic_handles.c}.
+
 @node More examples
 @section More examples
 

+ 26 - 0
doc/chapters/api.texi

@@ -1897,6 +1897,17 @@ exceed @code{STARPU_NMAXBUFS}.
 If unsufficient, this value can be set with the @code{--enable-maxbuffers}
 option when configuring StarPU.
 
+@item @code{enum starpu_access_mode *dyn_modes}
+Is an array of @code{enum starpu_access_mode}. It describes the
+required access modes to the data needed by the codelet (e.g.
+@code{STARPU_RW}). The number of entries in this array must be
+specified in the @code{nbuffers} field (defined above).
+This field should be used for codelets whose number of buffers is
+greater than @code{STARPU_NMAXBUFS} (@pxref{Setting the Data Handles
+for a Task}).
+When defining a codelet, one should define either this field or the
+field @code{modes} defined above.
+
 @item @code{struct starpu_perfmodel *model} (optional)
 This is a pointer to the task duration performance model associated to this
 codelet. This optional field is ignored when set to @code{NULL} or
@@ -1982,10 +1993,25 @@ of entries in this array must be specified in the @code{nbuffers} field of the
 If unsufficient, this value can be set with the @code{--enable-maxbuffers}
 option when configuring StarPU.
 
+@item @code{starpu_data_handle_t *dyn_handles}
+Is an array of @code{starpu_data_handle_t}. It specifies the handles
+to the different pieces of data accessed by the task. The number
+of entries in this array must be specified in the @code{nbuffers} field of the
+@code{struct starpu_codelet} structure.
+This field should be used for tasks whose number of buffers is
+greater than @code{STARPU_NMAXBUFS} (@pxref{Setting the Data Handles
+for a Task}).
+When defining a task, one should define either this field or the
+field @code{handles} defined above.
+
 @item @code{void *interfaces[STARPU_NMAXBUFS]}
 The actual data pointers to the memory node where execution will happen, managed
 by the DSM.
 
+@item @code{void **dyn_interfaces}
+The actual data pointers to the memory node where execution will happen, managed
+by the DSM. This field is used when the field @code{dyn_handles} is defined.
+
 @item @code{void *cl_arg} (optional; default: @code{NULL})
 This pointer is passed to the codelet through the second argument
 of the codelet implementation (e.g. @code{cpu_func} or @code{cuda_func}).

+ 1 - 0
doc/chapters/configuration.texi

@@ -234,6 +234,7 @@ Enable gathering of various data statistics (@pxref{Data statistics}).
 @end defvr
 
 @defvr {Configure option} --enable-maxbuffers
+@anchor{--enable-maxbuffers}
 Define the maximum number of buffers that tasks will be able to take
 as parameters, then available as the @code{STARPU_NMAXBUFS} macro.
 @end defvr

+ 7 - 1
include/starpu_task.h

@@ -96,6 +96,7 @@ struct starpu_codelet
 	unsigned nbuffers;
 	/* which are the access modes for these buffers */
 	enum starpu_access_mode modes[STARPU_NMAXBUFS];
+	enum starpu_access_mode *dyn_modes;
 
 	/* performance model of the codelet */
 	struct starpu_perfmodel *model;
@@ -120,6 +121,9 @@ struct starpu_task
 	starpu_data_handle_t handles[STARPU_NMAXBUFS];
 	void *interfaces[STARPU_NMAXBUFS];
 
+	starpu_data_handle_t *dyn_handles;
+	void **dyn_interfaces;
+
 	/* arguments not managed by the DSM are given as a buffer */
 	void *cl_arg;
 	/* in case the argument buffer has to be uploaded explicitely */
@@ -240,7 +244,9 @@ struct starpu_task
 	.sched_ctx = 0,					\
 	.hypervisor_tag = 0,				\
 	.flops = 0.0,					\
-		.scheduled = 0				\
+	.scheduled = 0,					\
+	.dyn_handles = NULL,				\
+	.dyn_interfaces = NULL				\
 }
 
 /*

+ 4 - 0
mpi/src/starpu_mpi_insert_task.c

@@ -596,6 +596,10 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 		_STARPU_MPI_DEBUG(1, "Execution of the codelet %p (%s)\n", codelet, codelet->name);
 		va_start(varg_list, codelet);
 		struct starpu_task *task = starpu_task_create();
+		/* Too many buffers for the static handles[] array: allocate the dynamic one,
+		 * sized from the codelet parameter used by the surrounding lines. */
+		if (codelet->nbuffers > STARPU_NMAXBUFS)
+			task->dyn_handles = malloc(codelet->nbuffers * sizeof(starpu_data_handle_t));
 		int ret = _starpu_insert_task_create_and_submit(arg_buffer, arg_buffer_size, codelet, &task, varg_list);
 		STARPU_ASSERT_MSG(ret==0, "_starpu_insert_task_create_and_submit failure %d", ret);
 	}

+ 18 - 11
src/core/dependencies/data_concurrency.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -192,8 +192,8 @@ static unsigned attempt_to_submit_data_request_from_job(struct _starpu_job *j, u
 {
 	/* Note that we do not access j->task->handles, but j->ordered_buffers
 	 * which is a sorted copy of it. */
-	starpu_data_handle_t handle = j->ordered_buffers[buffer_index].handle;
-	enum starpu_access_mode mode = j->ordered_buffers[buffer_index].mode;
+	starpu_data_handle_t handle = _STARPU_JOB_GET_ORDERED_BUFFER_HANDLE(j, buffer_index);
+	enum starpu_access_mode mode = _STARPU_JOB_GET_ORDERED_BUFFER_MODE(j, buffer_index);
 
 	return _starpu_attempt_to_submit_data_request(1, handle, mode, NULL, NULL, j, buffer_index);
 }
@@ -205,11 +205,16 @@ static unsigned _submit_job_enforce_data_deps(struct _starpu_job *j, unsigned st
 	unsigned nbuffers = j->task->cl->nbuffers;
 	for (buf = start_buffer_index; buf < nbuffers; buf++)
 	{
-		if (buf && j->ordered_buffers[buf-1].handle == j->ordered_buffers[buf].handle)
-			/* We have already requested this data, skip it. This
-			 * depends on ordering putting writes before reads, see
-			 * _starpu_compar_handles.  */
-			continue;
+		if (buf)
+		{
+			starpu_data_handle_t handle_m1 = _STARPU_JOB_GET_ORDERED_BUFFER_HANDLE(j, buf-1);
+			starpu_data_handle_t handle = _STARPU_JOB_GET_ORDERED_BUFFER_HANDLE(j, buf);
+			if (handle_m1 == handle)
+				/* We have already requested this data, skip it. This
+				 * depends on ordering putting writes before reads, see
+				 * _starpu_compar_handles.  */
+				continue;
+		}
 
                 j->task->status = STARPU_TASK_BLOCKED_ON_DATA;
                 if (attempt_to_submit_data_request_from_job(j, buf))
@@ -238,11 +243,13 @@ unsigned _starpu_submit_job_enforce_data_deps(struct _starpu_job *j)
 	unsigned i;
 	for (i=0 ; i<cl->nbuffers ; i++)
 	{
-		j->ordered_buffers[i].handle = j->task->handles[i];
-		j->ordered_buffers[i].mode = j->task->cl->modes[i];
+		starpu_data_handle_t handle = _STARPU_TASK_GET_HANDLE(j->task, i);
+		_STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(j, handle, i);
+		enum starpu_access_mode mode = _STARPU_CODELET_GET_MODE(j->task->cl, i);
+		_STARPU_JOB_SET_ORDERED_BUFFER_MODE(j, mode, i);
 	}
 
-	_starpu_sort_task_handles(j->ordered_buffers, cl->nbuffers);
+	_starpu_sort_task_handles(_STARPU_JOB_GET_ORDERED_BUFFERS(j), cl->nbuffers);
 
 	return _submit_job_enforce_data_deps(j, 0);
 }

+ 3 - 3
src/core/dependencies/implicit_data_deps.c

@@ -336,8 +336,8 @@ void _starpu_detect_implicit_data_deps(struct starpu_task *task)
 	unsigned buffer;
 	for (buffer = 0; buffer < nbuffers; buffer++)
 	{
-		starpu_data_handle_t handle = task->handles[buffer];
-		enum starpu_access_mode mode = task->cl->modes[buffer];
+		starpu_data_handle_t handle = _STARPU_TASK_GET_HANDLE(task, buffer);
+		enum starpu_access_mode mode = _STARPU_CODELET_GET_MODE(task->cl, buffer);
 		struct starpu_task *new_task;
 
 		/* Scratch memory does not introduce any deps */
@@ -457,7 +457,7 @@ void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *tas
 void _starpu_release_task_enforce_sequential_consistency(struct _starpu_job *j)
 {
 	struct starpu_task *task = j->task;
-        struct starpu_buffer_descr *descrs = j->ordered_buffers;
+        struct starpu_buffer_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
 
 	if (!task->cl)
 		return;

+ 14 - 3
src/core/jobs.c

@@ -52,6 +52,9 @@ struct _starpu_job* __attribute__((malloc)) _starpu_job_create(struct starpu_tas
 	 * everywhere */
 	memset(job, 0, sizeof(*job));
 
+	if (task->dyn_handles)
+	     job->dyn_ordered_buffers = malloc(task->cl->nbuffers * sizeof(struct starpu_buffer_descr));
+
 	job->task = task;
 
 #ifndef STARPU_USE_FXT
@@ -104,6 +107,11 @@ void _starpu_job_destroy(struct _starpu_job *j)
 	}
 
 	_starpu_cg_list_deinit(&j->job_successors);
+	if (j->dyn_ordered_buffers)
+	{
+	     free(j->dyn_ordered_buffers);
+	     j->dyn_ordered_buffers = NULL;
+	}
 
 	_starpu_job_delete(j);
 }
@@ -149,8 +157,11 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 	int i;
 	size_t data_size = 0;
 	for(i = 0; i < STARPU_NMAXBUFS; i++)
-		if(task->handles[i] != NULL)
-			data_size += _starpu_data_get_size(task->handles[i]);
+	{
+		starpu_data_handle_t handle = _STARPU_TASK_GET_HANDLE(task, i);
+		if (handle != NULL)
+			data_size += _starpu_data_get_size(handle);
+	}
 #endif //STARPU_USE_SC_HYPERVISOR
 
 	/* We release handle reference count */
@@ -159,7 +170,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 		unsigned i;
 		for (i=0; i<task->cl->nbuffers; i++)
 		{
-			starpu_data_handle_t handle = task->handles[i];
+			starpu_data_handle_t handle = _STARPU_TASK_GET_HANDLE(task, i);
 			_starpu_spin_lock(&handle->header_lock);
 			handle->busy_count--;
 			if (!_starpu_data_check_not_busy(handle))

+ 10 - 0
src/core/jobs.h

@@ -70,6 +70,7 @@ LIST_TYPE(_starpu_job,
 	 * the task so that we always grab the rw-lock associated to the
 	 * handles in the same order. */
 	struct starpu_buffer_descr ordered_buffers[STARPU_NMAXBUFS];
+	struct starpu_buffer_descr *dyn_ordered_buffers;
 
 	/* If a tag is associated to the job, this points to the internal data
 	 * structure that describes the tag status. */
@@ -172,4 +173,13 @@ struct starpu_task *_starpu_pop_local_task(struct _starpu_worker *worker);
  * enforce a FIFO ordering. */
 int _starpu_push_local_task(struct _starpu_worker *worker, struct starpu_task *task, int back);
 
+/* Ordered-buffer getters: read dyn_ordered_buffers when it is allocated, else the fixed-size ordered_buffers array. */
+#define _STARPU_JOB_GET_ORDERED_BUFFER_HANDLE(job, i) (((job)->dyn_ordered_buffers) ? (job)->dyn_ordered_buffers[i].handle : (job)->ordered_buffers[i].handle)
+#define _STARPU_JOB_GET_ORDERED_BUFFER_MODE(job, i) (((job)->dyn_ordered_buffers) ? (job)->dyn_ordered_buffers[i].mode : (job)->ordered_buffers[i].mode)
+/* Setters are single statements; value parameters are named _h/_m/_b so they cannot capture the .handle/.mode member tokens. */
+#define _STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(job, _h, i) do { if ((job)->dyn_ordered_buffers) (job)->dyn_ordered_buffers[i].handle = (_h); else (job)->ordered_buffers[i].handle = (_h);} while(0)
+#define _STARPU_JOB_SET_ORDERED_BUFFER_MODE(job, _m, i) do { if ((job)->dyn_ordered_buffers) (job)->dyn_ordered_buffers[i].mode = (_m); else (job)->ordered_buffers[i].mode = (_m);} while(0)
+#define _STARPU_JOB_SET_ORDERED_BUFFER(job, _b, i) do { if ((job)->dyn_ordered_buffers) (job)->dyn_ordered_buffers[i] = (_b); else (job)->ordered_buffers[i] = (_b);} while(0)
+#define _STARPU_JOB_GET_ORDERED_BUFFERS(job) (((job)->dyn_ordered_buffers) ? (job)->dyn_ordered_buffers : (job)->ordered_buffers)
+
 #endif // __JOBS_H__

+ 5 - 5
src/core/perfmodel/perfmodel.c

@@ -227,7 +227,7 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 		starpu_data_handle_t handle;
 		struct starpu_task *conversion_task;
 
-		handle = task->handles[i];
+		handle = _STARPU_TASK_GET_HANDLE(task, i);
 		if (!_starpu_data_is_multiformat_handle(handle))
 			continue;
 
@@ -287,8 +287,8 @@ double starpu_task_expected_data_transfer_time(unsigned memory_node, struct star
 
 	for (buffer = 0; buffer < nbuffers; buffer++)
 	{
-		starpu_data_handle_t handle = task->handles[buffer];
-		enum starpu_access_mode mode = task->cl->modes[buffer];
+		starpu_data_handle_t handle = _STARPU_TASK_GET_HANDLE(task, buffer);
+		enum starpu_access_mode mode = _STARPU_CODELET_GET_MODE(task->cl, buffer);
 
 		penalty += starpu_data_expected_transfer_time(handle, memory_node, mode);
 	}
@@ -375,8 +375,8 @@ double starpu_task_bundle_expected_data_transfer_time(starpu_task_bundle_t bundl
 			unsigned b;
 			for (b = 0; b < task->cl->nbuffers; b++)
 			{
-				starpu_data_handle_t handle = task->handles[b];
-				enum starpu_access_mode mode = task->cl->modes[b];
+				starpu_data_handle_t handle = _STARPU_TASK_GET_HANDLE(task, b);
+				enum starpu_access_mode mode = _STARPU_CODELET_GET_MODE(task->cl, b);
 
 				if (!(mode & STARPU_R))
 					continue;

+ 2 - 2
src/core/perfmodel/perfmodel_history.c

@@ -72,7 +72,7 @@ size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, enum starpu_per
 		unsigned buffer;
 		for (buffer = 0; buffer < nbuffers; buffer++)
 		{
-			starpu_data_handle_t handle = task->handles[buffer];
+			starpu_data_handle_t handle = _STARPU_TASK_GET_HANDLE(task, buffer);
 			size += _starpu_data_get_size(handle);
 		}
 		return size;
@@ -1267,7 +1267,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
 		for (i = 0; i < task->cl->nbuffers; i++)
 		{
-			starpu_data_handle_t handle = task->handles[i];
+			starpu_data_handle_t handle = _STARPU_TASK_GET_HANDLE(task, i);
 
 			STARPU_ASSERT(handle->ops);
 			STARPU_ASSERT(handle->ops->display);

+ 8 - 5
src/core/sched_policy.c

@@ -236,7 +236,7 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 				struct starpu_task *conversion_task;
 				starpu_data_handle_t handle;
 
-				handle = task->handles[i];
+				handle = _STARPU_TASK_GET_HANDLE(task, i);
 				if (!_starpu_handle_needs_conversion_task(handle, node))
 					continue;
 
@@ -249,7 +249,10 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 			}
 
 			for (i = 0; i < task->cl->nbuffers; i++)
-				task->handles[i]->mf_node = node;
+			{
+				starpu_data_handle_t handle = _STARPU_TASK_GET_HANDLE(task, i);
+				handle->mf_node = node;
+			}
 		}
 //		if(task->sched_ctx != _starpu_get_initial_sched_ctx()->id)
 
@@ -447,7 +450,7 @@ struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t
 
 	conversion_task = starpu_task_create();
 	conversion_task->synchronous = 0;
-	conversion_task->handles[0] = handle;
+	_STARPU_TASK_SET_HANDLE(conversion_task, handle, 0);
 
 #if defined(STARPU_USE_OPENCL) || defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 	/* The node does not really matter here */
@@ -510,7 +513,7 @@ struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t
 		STARPU_ABORT();
 	}
 
-	conversion_task->cl->modes[0] = STARPU_RW;
+	_STARPU_CODELET_SET_MODE(conversion_task->cl, STARPU_RW, 0);
 	return conversion_task;
 }
 
@@ -663,7 +666,7 @@ pick:
 		struct starpu_task *conversion_task;
 		starpu_data_handle_t handle;
 
-		handle = task->handles[i];
+		handle = _STARPU_TASK_GET_HANDLE(task, i);
 		if (!_starpu_handle_needs_conversion_task(handle, node))
 			continue;
 		conversion_task = _starpu_create_conversion_task(handle, node);

+ 34 - 10
src/core/task.c

@@ -77,6 +77,11 @@ void starpu_task_init(struct starpu_task *task)
 	task->sched_ctx = _starpu_get_initial_sched_ctx()->id;
 
 	task->flops = 0.0;
+
+	task->scheduled = 0;
+
+	task->dyn_handles = NULL;
+	task->dyn_interfaces = NULL;
 }
 
 /* Free all the ressources allocated for a task, without deallocating the task
@@ -99,6 +104,14 @@ void starpu_task_clean(struct starpu_task *task)
 	if (bundle)
 		starpu_task_bundle_remove(bundle, task);
 
+	if (task->dyn_handles)
+	{
+		free(task->dyn_handles);
+		task->dyn_handles = NULL;
+		free(task->dyn_interfaces);
+		task->dyn_interfaces = NULL;
+	}
+
 	struct _starpu_job *j = (struct _starpu_job *)task->starpu_private;
 
 	if (j)
@@ -229,7 +242,7 @@ int _starpu_submit_job(struct _starpu_job *j)
 		unsigned i;
 		for (i=0; i<task->cl->nbuffers; i++)
 		{
-			starpu_data_handle_t handle = task->handles[i];
+			starpu_data_handle_t handle = _STARPU_TASK_GET_HANDLE(task, i);
 			_starpu_spin_lock(&handle->header_lock);
 			handle->busy_count++;
 			_starpu_spin_unlock(&handle->header_lock);
@@ -393,16 +406,23 @@ int starpu_task_submit(struct starpu_task *task)
 		unsigned i;
 
 		/* Check buffers */
-		STARPU_ASSERT_MSG(task->cl->nbuffers <= STARPU_NMAXBUFS, "Codelet %p has too many buffers (%d vs max %d)", task->cl, task->cl->nbuffers, STARPU_NMAXBUFS);
+		if (task->dyn_handles == NULL)
+			STARPU_ASSERT_MSG(task->cl->nbuffers <= STARPU_NMAXBUFS, "Codelet %p has too many buffers (%d vs max %d)", task->cl, task->cl->nbuffers, STARPU_NMAXBUFS);
+
+		if (task->dyn_handles)
+		{
+			task->dyn_interfaces = malloc(task->cl->nbuffers * sizeof(void *));
+		}
+
 		for (i = 0; i < task->cl->nbuffers; i++)
 		{
-			starpu_data_handle_t handle = task->handles[i];
+			starpu_data_handle_t handle = _STARPU_TASK_GET_HANDLE(task, i);
 			/* Make sure handles are not partitioned */
 			STARPU_ASSERT_MSG(handle->nchildren == 0, "only unpartitioned data can be used in a task");
 			/* Provide the home interface for now if any,
 			 * for can_execute hooks */
 			if (handle->home_node != -1)
-				task->interfaces[i] = starpu_data_get_interface_on_node(task->handles[i], handle->home_node);
+				_STARPU_TASK_SET_INTERFACE(task, starpu_data_get_interface_on_node(handle, handle->home_node), i);
 		}
 
 		/* Check the type of worker(s) required by the task exist */
@@ -526,8 +546,10 @@ int _starpu_task_submit_nodeps(struct starpu_task *task)
 		unsigned i;
 		for (i=0 ; i<task->cl->nbuffers ; i++)
 		{
-			j->ordered_buffers[i].handle = j->task->handles[i];
-			j->ordered_buffers[i].mode = j->task->cl->modes[i];
+			starpu_data_handle_t handle = _STARPU_TASK_GET_HANDLE(j->task, i);
+			_STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(j, handle, i);
+			enum starpu_access_mode mode = _STARPU_CODELET_GET_MODE(j->task->cl, i);
+			_STARPU_JOB_SET_ORDERED_BUFFER_MODE(j, mode, i);
 		}
 	}
 
@@ -559,7 +581,7 @@ int _starpu_task_submit_conversion_task(struct starpu_task *task,
 	unsigned i;
 	for (i=0; i<task->cl->nbuffers; i++)
 	{
-		starpu_data_handle_t handle = task->handles[i];
+		starpu_data_handle_t handle = _STARPU_TASK_GET_HANDLE(task, i);
 		_starpu_spin_lock(&handle->header_lock);
 		handle->busy_count++;
 		_starpu_spin_unlock(&handle->header_lock);
@@ -574,8 +596,10 @@ int _starpu_task_submit_conversion_task(struct starpu_task *task,
 
 	for (i=0 ; i<task->cl->nbuffers ; i++)
 	{
-		j->ordered_buffers[i].handle = j->task->handles[i];
-		j->ordered_buffers[i].mode = j->task->cl->modes[i];
+		starpu_data_handle_t handle = _STARPU_TASK_GET_HANDLE(j->task, i);
+		_STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(j, handle, i);
+		enum starpu_access_mode mode = _STARPU_CODELET_GET_MODE(j->task->cl, i);
+		_STARPU_JOB_SET_ORDERED_BUFFER_MODE(j, mode, i);
 	}
 
         _STARPU_LOG_IN();
@@ -811,7 +835,7 @@ _starpu_task_uses_multiformat_handles(struct starpu_task *task)
 	unsigned i;
 	for (i = 0; i < task->cl->nbuffers; i++)
 	{
-		if (_starpu_data_is_multiformat_handle(task->handles[i]))
+		if (_starpu_data_is_multiformat_handle(_STARPU_TASK_GET_HANDLE(task, i)))
 			return 1;
 	}
 

+ 9 - 0
src/core/task.h

@@ -73,4 +73,13 @@ starpu_cpu_func_t _starpu_task_get_cpu_nth_implementation(struct starpu_codelet
 starpu_cuda_func_t _starpu_task_get_cuda_nth_implementation(struct starpu_codelet *cl, unsigned nimpl);
 starpu_opencl_func_t _starpu_task_get_opencl_nth_implementation(struct starpu_codelet *cl, unsigned nimpl);
 
+/* Task accessors: use dyn_handles/dyn_interfaces when dyn_handles is set, else the fixed-size handles/interfaces arrays. */
+#define _STARPU_TASK_GET_HANDLE(task, i) (((task)->dyn_handles) ? (task)->dyn_handles[i] : (task)->handles[i])
+#define _STARPU_TASK_SET_HANDLE(task, handle, i) do { if ((task)->dyn_handles) (task)->dyn_handles[i] = (handle); else (task)->handles[i] = (handle); } while(0)
+#define _STARPU_TASK_SET_INTERFACE(task, interface, i) do { if ((task)->dyn_handles) (task)->dyn_interfaces[i] = (interface); else (task)->interfaces[i] = (interface);} while(0)
+#define _STARPU_TASK_GET_INTERFACES(task) (((task)->dyn_handles) ? (task)->dyn_interfaces : (task)->interfaces)
+/* Codelet accessors: use dyn_modes when it is set, else the fixed-size modes array; the getter is fully parenthesized so it is safe inside larger expressions. */
+#define _STARPU_CODELET_GET_MODE(codelet, i) (((codelet)->dyn_modes) ? (codelet)->dyn_modes[i] : (codelet)->modes[i])
+#define _STARPU_CODELET_SET_MODE(codelet, mode, i) do { if ((codelet)->dyn_modes) (codelet)->dyn_modes[i] = (mode); else (codelet)->modes[i] = (mode); } while(0)
+
 #endif // __CORE_TASK_H__

+ 8 - 7
src/datawizard/coherency.c

@@ -22,6 +22,7 @@
 #include <core/dependencies/data_concurrency.h>
 #include <profiling/profiling.h>
 #include <math.h>
+#include <core/task.h>
 
 static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned src_node, unsigned dst_node, unsigned *handling_node);
 unsigned _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
@@ -591,8 +592,8 @@ int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node)
 
 	for (index = 0; index < nbuffers; index++)
 	{
-		starpu_data_handle_t handle = task->handles[index];
-		enum starpu_access_mode mode = task->cl->modes[index];
+		starpu_data_handle_t handle = _STARPU_TASK_GET_HANDLE(task, index);
+		enum starpu_access_mode mode = _STARPU_CODELET_GET_MODE(task->cl, index);
 
 		if (mode & (STARPU_SCRATCH|STARPU_REDUX))
 			continue;
@@ -624,7 +625,7 @@ int _starpu_fetch_task_input(struct _starpu_job *j, uint32_t mask)
 	if (profiling && task->profiling_info)
 		_starpu_clock_gettime(&task->profiling_info->acquire_data_start_time);
 
-	struct starpu_buffer_descr *descrs = j->ordered_buffers;
+	struct starpu_buffer_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
 	unsigned nbuffers = task->cl->nbuffers;
 
 	unsigned local_memory_node = _starpu_memory_node_get_local_key();
@@ -656,14 +657,14 @@ int _starpu_fetch_task_input(struct _starpu_job *j, uint32_t mask)
 	/* Now that we have taken the data locks in locking order, fill the codelet interfaces in function order.  */
 	for (index = 0; index < nbuffers; index++)
 	{
-		starpu_data_handle_t handle = task->handles[index];
-		enum starpu_access_mode mode = task->cl->modes[index];
+		starpu_data_handle_t handle = _STARPU_TASK_GET_HANDLE(task, index);
+		enum starpu_access_mode mode = _STARPU_CODELET_GET_MODE(task->cl, index);
 
 		struct _starpu_data_replicate *local_replicate;
 
 		local_replicate = get_replicate(handle, mode, workerid, local_memory_node);
 
-		task->interfaces[index] = local_replicate->data_interface;
+		_STARPU_TASK_SET_INTERFACE(task , local_replicate->data_interface, index);
 
 		if (mode & STARPU_REDUX)
 		{
@@ -699,7 +700,7 @@ void _starpu_push_task_output(struct _starpu_job *j, uint32_t mask)
 	if (profiling && task->profiling_info)
 		_starpu_clock_gettime(&task->profiling_info->release_data_start_time);
 
-        struct starpu_buffer_descr *descrs = j->ordered_buffers;
+        struct starpu_buffer_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
         unsigned nbuffers = task->cl->nbuffers;
 
 	int workerid = starpu_worker_get_id();

+ 1 - 1
src/datawizard/filters.c

@@ -305,7 +305,7 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 				.nbuffers = 1
 			};
 			struct starpu_task *task = starpu_task_create();
-			task->handles[0] = child_handle;
+			_STARPU_TASK_SET_HANDLE(task, child_handle, 0);
 			task->cl = &cl;
 			task->synchronous = 1;
 			if (_starpu_task_submit_internally(task) != 0)

+ 1 - 1
src/datawizard/footprint.c

@@ -43,7 +43,7 @@ uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, enum
 	{
 		for (buffer = 0; buffer < task->cl->nbuffers; buffer++)
 		{
-			starpu_data_handle_t handle = task->handles[buffer];
+			starpu_data_handle_t handle = _STARPU_TASK_GET_HANDLE(task, buffer);
 
 			uint32_t handle_footprint = _starpu_data_get_footprint(handle);
 

+ 5 - 5
src/datawizard/reduction.c

@@ -225,8 +225,8 @@ void _starpu_data_end_reduction_mode(starpu_data_handle_t handle)
 					STARPU_ASSERT_MSG(redux_task->cl->modes[0] == STARPU_RW, "First parameter of reduction codelet has to be RW");
 					STARPU_ASSERT_MSG(redux_task->cl->modes[1] == STARPU_R, "Second parameter of reduction codelet has to be R");
 
-					redux_task->handles[0] = replicate_array[i];
-					redux_task->handles[1] = replicate_array[i+step];
+					_STARPU_TASK_SET_HANDLE(redux_task, replicate_array[i], 0);
+					_STARPU_TASK_SET_HANDLE(redux_task, replicate_array[i+step], 1);
 
 					int ndeps = 0;
 					struct starpu_task *task_deps[2];
@@ -281,7 +281,7 @@ void _starpu_data_end_reduction_mode(starpu_data_handle_t handle)
 			if (!redux_task->cl->modes[0])
 				redux_task->cl->modes[0] = STARPU_W;
 			STARPU_ASSERT_MSG(redux_task->cl->modes[0] == STARPU_W, "Parameter of initialization codelet has to be W");
-			redux_task->handles[0] = handle;
+			_STARPU_TASK_SET_HANDLE(redux_task, handle, 0);
 
 			int ret = _starpu_task_submit_internally(redux_task);
 			STARPU_ASSERT(!ret);
@@ -311,8 +311,8 @@ void _starpu_data_end_reduction_mode(starpu_data_handle_t handle)
 			STARPU_ASSERT_MSG(redux_task->cl->modes[0] == STARPU_RW, "First parameter of reduction codelet has to be RW");
 			STARPU_ASSERT_MSG(redux_task->cl->modes[1] == STARPU_R, "Second parameter of reduction codelet has to be R");
 
-			redux_task->handles[0] = handle;
-			redux_task->handles[1] = replicate_array[replicate];
+			_STARPU_TASK_SET_HANDLE(redux_task, handle, 0);
+			_STARPU_TASK_SET_HANDLE(redux_task, replicate_array[replicate], 1);
 
 			int ret = _starpu_task_submit_internally(redux_task);
 			STARPU_ASSERT(!ret);

+ 1 - 1
src/drivers/cpu/driver_cpu.c

@@ -158,7 +158,7 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 #ifdef STARPU_SIMGRID
 		_starpu_simgrid_execute_job(j, perf_arch, NAN);
 #else
-		func(task->interfaces, task->cl_arg);
+		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
 #endif
 		if (is_parallel_task && cl->type == STARPU_FORKJOIN)
 			/* rebind to single CPU */

+ 1 - 1
src/drivers/cuda/driver_cuda.c

@@ -353,7 +353,7 @@ static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 #ifdef STARPU_SIMGRID
 	_starpu_simgrid_execute_job(j, args->perf_arch, NAN);
 #else
-	func(task->interfaces, task->cl_arg);
+	func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
 #endif
 
 	_starpu_driver_end_job(args, j, args->perf_arch, &codelet_end, 0, profiling);

+ 3 - 3
src/drivers/gordon/driver_gordon.c

@@ -102,7 +102,7 @@ static void starpu_to_gordon_buffers(struct _starpu_job *j, struct gordon_ppu_jo
 	unsigned nbuffers = cl->nbuffers;
 	for (buffer = 0; buffer < nbuffers; buffer++)
 	{
-		enum starpu_access_mode mode = cl->modes[buffer];
+		enum starpu_access_mode mode = _STARPU_CODELET_GET_MODE(cl, buffer);
 
 		switch (mode)
 		{
@@ -122,7 +122,7 @@ static void starpu_to_gordon_buffers(struct _starpu_job *j, struct gordon_ppu_jo
 	for (buffer = 0; buffer < nbuffers; buffer++)
 	{
 		unsigned gordon_buffer;
-		enum starpu_access_mode mode = cl->modes[buffer];
+		enum starpu_access_mode mode = _STARPU_CODELET_GET_MODE(cl, buffer);
 
 		switch (mode)
 		{
@@ -138,7 +138,7 @@ static void starpu_to_gordon_buffers(struct _starpu_job *j, struct gordon_ppu_jo
 				break;
 		}
 
-		starpu_data_handle_t handle = task->handles[buffer];
+		starpu_data_handle_t handle = _STARPU_TASK_GET_HANDLE(task, buffer);
 
 		gordon_job->nalloc = 0;
 		gordon_job->nin = nin;

+ 2 - 2
src/drivers/opencl/driver_opencl.c

@@ -824,7 +824,7 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 #ifdef STARPU_SIMGRID
 	double length = NAN;
   #ifdef STARPU_OPENCL_SIMULATOR
-	func(task->interfaces, task->cl_arg);
+	func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
     #ifndef CL_PROFILING_CLOCK_CYCLE_COUNT
       #ifdef CL_PROFILING_COMMAND_SHAVE_CYCLE_COUNT
         #define CL_PROFILING_CLOCK_CYCLE_COUNT CL_PROFILING_COMMAND_SHAVE_CYCLE_COUNT
@@ -838,7 +838,7 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
   #endif
 	_starpu_simgrid_execute_job(j, args->perf_arch, length);
 #else
-	func(task->interfaces, task->cl_arg);
+	func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
 #endif
 
 	_starpu_driver_end_job(args, j, args->perf_arch, &codelet_end, 0, profiling);

+ 1 - 1
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -80,7 +80,7 @@ static int count_non_ready_buffers(struct starpu_task *task, unsigned node)
 	{
 		starpu_data_handle_t handle;
 
-		handle = task->handles[index];
+		handle = _STARPU_TASK_GET_HANDLE(task, index);
 
 		int is_valid;
 		starpu_data_query_status(handle, node, NULL, &is_valid, NULL);

+ 2 - 2
src/util/starpu_data_cpy.c

@@ -103,8 +103,8 @@ int _starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_h
 	task->callback_func = callback_func;
 	task->callback_arg = callback_arg;
 
-	task->handles[0] = dst_handle;
-	task->handles[1] = src_handle;
+	_STARPU_TASK_SET_HANDLE(task, dst_handle, 0);
+	_STARPU_TASK_SET_HANDLE(task, src_handle, 1);
 
 	task->synchronous = !asynchronous;
 

+ 7 - 1
src/util/starpu_insert_task.c

@@ -79,8 +79,14 @@ int starpu_insert_task(struct starpu_codelet *cl, ...)
 		_starpu_codelet_pack_args((char **)&arg_buffer, arg_buffer_size, varg_list);
 	}
 
-	va_start(varg_list, cl);
 	struct starpu_task *task = starpu_task_create();
+
+	if (cl->nbuffers > STARPU_NMAXBUFS)
+	{
+		task->dyn_handles = malloc(cl->nbuffers * sizeof(starpu_data_handle_t));
+	}
+
+	va_start(varg_list, cl);
 	int ret = _starpu_insert_task_create_and_submit(arg_buffer, arg_buffer_size, cl, &task, varg_list);
 
 	if (ret == -ENODEV)

+ 3 - 2
src/util/starpu_insert_task_utils.c

@@ -18,6 +18,7 @@
 #include <util/starpu_insert_task_utils.h>
 #include <common/config.h>
 #include <common/utils.h>
+#include <core/task.h>
 
 typedef void (*_starpu_callback_func_t)(void *);
 
@@ -239,7 +240,7 @@ int _starpu_insert_task_create_and_submit(void *arg_buffer, size_t arg_buffer_si
 
 			STARPU_ASSERT(cl != NULL);
 
-			(*task)->handles[current_buffer] = handle;
+			_STARPU_TASK_SET_HANDLE((*task), handle, current_buffer);
 			if (cl->modes[current_buffer])
 			{
 				STARPU_ASSERT_MSG(cl->modes[current_buffer] == mode, "The codelet <%s> defines the access mode %d for the buffer %d which is different from the mode %d given to starpu_insert_task\n",
@@ -264,7 +265,7 @@ int _starpu_insert_task_create_and_submit(void *arg_buffer, size_t arg_buffer_si
 			int i;
 			for(i=0 ; i<nb_handles ; i++)
 			{
-				(*task)->handles[current_buffer] = handles[i];
+				_STARPU_TASK_SET_HANDLE((*task), handles[i], current_buffer);
 				current_buffer++;
 			}