@@ -931,28 +931,57 @@ struct _starpu_data_replicate *get_replicate(starpu_data_handle_t handle, enum s
 	return &handle->per_node[node];
 }
 
-/* Synchronously fetch data for a given task (if it's not there already) */
-int _starpu_fetch_task_input(struct _starpu_job *j)
+/* Callback used when a buffer is sent asynchronously to the sink */
+static void _starpu_fetch_task_input_cb(void *arg)
 {
-	_STARPU_TRACE_START_FETCH_INPUT(NULL);
+	struct _starpu_worker * worker = (struct _starpu_worker *) arg;
+
+	/* Increase the number of buffers received */
+	STARPU_WMB();
+	(void)STARPU_ATOMIC_ADD(&worker->nb_buffers_transferred, 1);
+}
+
+
+/* Synchronously or asynchronously fetch data for a given task (if it's not
+ * there already). Returns 0 on success, or -1 if a buffer could not be allocated (enomem). */
+/* The synchronous version of _starpu_fetch_task_input must be called before
+ * executing the task. __starpu_push_task_output must be called after the
+ * execution of the task. */
+/* To improve overlapping, the driver can, before calling the synchronous
+ * version of _starpu_fetch_task_input, call _starpu_fetch_task_input with
+ * async==1, then wait for transfers to complete, then call
+ * _starpu_release_fetch_task_input_async to release them before calling the
+ * synchronous version of _starpu_fetch_task_input. */
+int _starpu_fetch_task_input(struct starpu_task *task, struct _starpu_job *j, int async)
+{
+	struct _starpu_worker *worker = _starpu_get_local_worker_key();
+	int workerid = worker->workerid;
+	if (async)
+	{
+		worker->task_transferring = task;
+		worker->nb_buffers_transferred = 0;
+		_STARPU_TRACE_WORKER_START_FETCH_INPUT(NULL, workerid);
+	}
+	else
+		_STARPU_TRACE_START_FETCH_INPUT(NULL);
 
 	int profiling = starpu_profiling_status_get();
-	struct starpu_task *task = j->task;
 	if (profiling && task->profiling_info)
 		_starpu_clock_gettime(&task->profiling_info->acquire_data_start_time);
 
 	struct _starpu_data_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
 	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
+	unsigned nacquires;
 
 	unsigned local_memory_node = _starpu_memory_node_get_local_key();
 
-	int workerid = starpu_worker_get_id_check();
-
 #ifdef STARPU_USE_FXT
 	unsigned long total_size = 0;
 #endif
 
 	unsigned index;
+	nacquires = 0;
 	for (index = 0; index < nbuffers; index++)
 	{
 		int ret;
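
To make the overlapping protocol described in the comments above concrete, here is a minimal driver-side sketch of the intended call sequence (an illustration using the names introduced by this patch, not actual driver code; the busy-wait stands in for the driver's real progress loop):

	/* Issue all input transfers asynchronously; the completion callback counts them. */
	_starpu_fetch_task_input(task, j, 1);

	/* Wait until every issued transfer has run _starpu_fetch_task_input_cb. */
	while (worker->nb_buffers_transferred < worker->nb_buffers_totransfer)
		; /* schematic: a real driver progresses other requests here */

	/* Drop the references taken by the asynchronous pass... */
	_starpu_release_fetch_task_input_async(j, worker->workerid, worker->nb_buffers_totransfer);

	/* ...then re-acquire everything synchronously, forcing eviction if the
	 * asynchronous pass stopped early for lack of memory. */
	_starpu_fetch_task_input(task, j, 0);
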
@@ -976,13 +1005,33 @@ int _starpu_fetch_task_input(struct _starpu_job *j)
 
 		local_replicate = get_replicate(handle, mode, workerid, node);
 
-		ret = fetch_data(handle, node, local_replicate, mode, 0);
-		if (STARPU_UNLIKELY(ret))
-			goto enomem;
+		if (async)
+		{
+			ret = _starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, 0, 1,
+					_starpu_fetch_task_input_cb, worker, 0, "_starpu_src_common_worker_internal_work");
+			if (STARPU_UNLIKELY(ret))
+			{
+				/* Oops, not enough memory: make the worker wait for the transfers issued so far, and let the synchronous call finish by forcing eviction */
+				worker->nb_buffers_totransfer = nacquires;
+				return 0;
+			}
+		}
+		else
+		{
+			ret = fetch_data(handle, node, local_replicate, mode, 0);
+			if (STARPU_UNLIKELY(ret))
+				goto enomem;
+		}
 
 #ifdef STARPU_USE_FXT
 		total_size += _starpu_data_get_size(handle);
 #endif
+		nacquires++;
+	}
+	if (async)
+	{
+		worker->nb_buffers_totransfer = nacquires;
+		return 0;
 	}
 
 	_STARPU_TRACE_DATA_LOAD(workerid,total_size);
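
Note the memory-ordering contract between the two sides of the transfer counter: the completion callback publishes the transferred data before advertising it, and the release path (next hunk) reads the counter before touching the buffers. Condensed, with the barrier calls quoted from this patch:

	/* completion callback, per transfer: publish the data, then the count */
	STARPU_WMB();
	(void)STARPU_ATOMIC_ADD(&worker->nb_buffers_transferred, 1);

	/* driver side, once nb_buffers_transferred == nb_buffers_totransfer:
	 * the matching barrier in _starpu_release_fetch_task_input_async makes
	 * the transferred data visible before the buffers are used */
	STARPU_RMB();
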
@@ -1043,6 +1092,55 @@ enomem:
 	return -1;
 }
 
+/* This is to be called after _starpu_fetch_task_input was called with async=1, once the callback has fired for each buffer to be transferred. */
+int _starpu_release_fetch_task_input_async(struct _starpu_job *j, int workerid, int nbtransfers)
+{
+	STARPU_RMB();
+	_STARPU_TRACE_WORKER_END_FETCH_INPUT(NULL, workerid);
+	struct starpu_task *task = j->task;
+
+	struct _starpu_data_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
+	unsigned local_memory_node = _starpu_memory_node_get_local_key();
+	unsigned index;
+	unsigned nreleases;
+
+	nreleases = 0;
+	for (index = 0; index < nbuffers; index++)
+	{
+		if (nreleases == nbtransfers)
+			/* That was a partial fetch */
+			break;
+		starpu_data_handle_t handle = descrs[index].handle;
+		enum starpu_data_access_mode mode = descrs[index].mode;
+		int node = descrs[index].node;
+		if (node == -1)
+			node = local_memory_node;
+
+		struct _starpu_data_replicate *local_replicate;
+
+		if (index && descrs[index-1].handle == descrs[index].handle)
+			/* We have already taken this data, skip it. This
+			 * depends on the ordering putting writes before reads, see
+			 * _starpu_compar_handles */
+			continue;
+
+		local_replicate = get_replicate(handle, mode, workerid, node);
+
+		/* Release our refcnt */
+		_starpu_spin_lock(&handle->header_lock);
+		local_replicate->refcnt--;
+		STARPU_ASSERT(local_replicate->refcnt >= 0);
+		STARPU_ASSERT(handle->busy_count > 0);
+		handle->busy_count--;
+		/* If _starpu_data_check_not_busy destroyed the handle, it also released the header lock */
+		if (!_starpu_data_check_not_busy(handle))
+			_starpu_spin_unlock(&handle->header_lock);
+		nreleases++;
+	}
+	return 0;
+}
+
 /* Release task data dependencies */
 void __starpu_push_task_output(struct _starpu_job *j)
 {
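
A partial fetch is worth tracing end to end (hypothetical numbers; the calls are the ones added by this patch). With nbuffers == 4 and ENOMEM hit on the third asynchronous fetch:

	_starpu_fetch_task_input(task, j, 1);   /* issues 2 transfers, sets nb_buffers_totransfer = 2, returns 0 */
	/* ... wait until nb_buffers_transferred reaches 2 ... */
	_starpu_release_fetch_task_input_async(j, workerid, 2); /* releases the 2 acquired buffers, then breaks out */
	_starpu_fetch_task_input(task, j, 0);   /* synchronous pass: fetches all 4 buffers, evicting other data if needed */
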