Browse Source

Add an option so that CUDA workers do not perform slow allocations on other memory nodes

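This creates the option STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES. When set to 1, CUDA workers will not perform slow allocations (in pinned RAM) while making progress on memory nodes other than their own. This also changes may_alloc in several functions from a boolean to an enum, adding a third value (2, STARPU_DATAWIZARD_ONLY_FAST_ALLOC) meaning that allocations are allowed, but only fast ones.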
Lucas Nesi 4 years ago
parent
commit
42a8d55622

+ 10 - 0
doc/doxygen/chapters/501_environment_variables.doxy

@@ -473,6 +473,16 @@ todo
 todo
 </dd>
 
+<dt>STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES</dt>
+<dd>
+\anchor STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES
+\addindex __env__STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES
+Specify whether CUDA workers should perform only fast allocations
+when driving the datawizard progress of memory nodes other than their own
+(STARPU_DATAWIZARD_ONLY_FAST_ALLOC is then used for those nodes).
+Default value is 0, allowing CUDA workers to do slow allocations.
+</dd>
+
 </dl>
 
 \section ConfiguringTheSchedulingEngine Configuring The Scheduling Engine

+ 8 - 0
include/starpu.h

@@ -471,6 +471,14 @@ struct starpu_conf
 	   Maximum spinning backoff of drivers. Default value: \c 32
 	 */
 	unsigned driver_spinning_backoff_max;
+
+	/**
+	   Specify whether CUDA workers should perform only fast allocations
+	   when driving the datawizard progress of memory nodes other than their own
+	   (STARPU_DATAWIZARD_ONLY_FAST_ALLOC is then used for those nodes).
+	   Default value is 0, allowing CUDA workers to do slow allocations.
+	 */
+	int cuda_only_fast_alloc_other_memnodes;
 };
 
 /**

+ 1 - 1
src/core/dependencies/data_arbiter_concurrency.c

@@ -286,7 +286,7 @@ unsigned _starpu_attempt_to_submit_arbitered_data_request(unsigned request_from_
 		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 		{
 			cpt++;
-			_starpu_datawizard_progress(0);
+			_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_NOT_ALLOC);
 		}
 		if (cpt == STARPU_SPIN_MAXTRY)
 			_starpu_spin_lock(&handle->header_lock);

+ 2 - 2
src/core/dependencies/data_concurrency.c

@@ -132,7 +132,7 @@ static unsigned _starpu_attempt_to_submit_data_request(unsigned request_from_cod
 		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 		{
 			cpt++;
-			_starpu_datawizard_progress(0);
+			_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_NOT_ALLOC);
 		}
 		if (cpt == STARPU_SPIN_MAXTRY)
 			_starpu_spin_lock(&handle->header_lock);
@@ -266,7 +266,7 @@ static void _starpu_take_data(unsigned request_from_codelet,
 		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 		{
 			cpt++;
-			_starpu_datawizard_progress(0);
+			_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_NOT_ALLOC);
 		}
 		if (cpt == STARPU_SPIN_MAXTRY)
 			_starpu_spin_lock(&handle->header_lock);

+ 2 - 0
src/core/workers.c

@@ -1168,6 +1168,8 @@ int starpu_conf_init(struct starpu_conf *conf)
 
 	/* Do not start performance counter collection by default */
 	conf->start_perf_counter_collection = 0;
+
+	conf->cuda_only_fast_alloc_other_memnodes = starpu_get_env_number_default("STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES", 0);
 	return 0;
 }
 

+ 3 - 3
src/datawizard/coherency.c

@@ -601,7 +601,7 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 		{
 			/* And this is the main RAM without pinning, really no need for a
 			 * request, just quickly allocate and be done */
-			if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch) == 0)
+			if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch, 0) == 0)
 			{
 				_starpu_update_data_state(handle, dst_replicate, mode);
 				if (dst_replicate->mc)
@@ -778,7 +778,7 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 	{
 		cpt++;
-		_starpu_datawizard_progress(1);
+		_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC);
 	}
 	if (cpt == STARPU_SPIN_MAXTRY)
 		_starpu_spin_lock(&handle->header_lock);
@@ -918,7 +918,7 @@ void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_
 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 	{
 		cpt++;
-		_starpu_datawizard_progress(1);
+		_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC);
 	}
 	if (cpt == STARPU_SPIN_MAXTRY)
 		_starpu_spin_lock(&handle->header_lock);

+ 3 - 3
src/datawizard/copy_driver.c

@@ -200,7 +200,7 @@ int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT _starpu_driver_copy_data_1_to_1(starpu_d
 									struct _starpu_data_replicate *dst_replicate,
 									unsigned donotread,
 									struct _starpu_data_request *req,
-									unsigned may_alloc,
+									enum _starpu_may_alloc may_alloc,
 									enum starpu_is_prefetch prefetch STARPU_ATTRIBUTE_UNUSED)
 {
 	if (!donotread)
@@ -215,11 +215,11 @@ int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT _starpu_driver_copy_data_1_to_1(starpu_d
 	/* first make sure the destination has an allocated buffer */
 	if (!dst_replicate->allocated)
 	{
-		if (!may_alloc || _starpu_is_reclaiming(dst_node))
+		if (may_alloc==STARPU_DATAWIZARD_DO_NOT_ALLOC || _starpu_is_reclaiming(dst_node))
 			/* We're not supposed to allocate there at the moment */
 			return -ENOMEM;
 
-		int ret_alloc = _starpu_allocate_memory_on_node(handle, dst_replicate, prefetch);
+		int ret_alloc = _starpu_allocate_memory_on_node(handle, dst_replicate, prefetch, may_alloc==STARPU_DATAWIZARD_ONLY_FAST_ALLOC);
 		if (ret_alloc)
 			return -ENOMEM;
 	}

+ 8 - 1
src/datawizard/copy_driver.h

@@ -47,6 +47,13 @@ extern "C"
 struct _starpu_data_request;
 struct _starpu_data_replicate;
 
+enum _starpu_may_alloc
+{
+	STARPU_DATAWIZARD_DO_NOT_ALLOC,
+	STARPU_DATAWIZARD_DO_ALLOC,
+	STARPU_DATAWIZARD_ONLY_FAST_ALLOC
+};
+
 #ifdef STARPU_USE_MIC
 /** MIC needs memory_node to know which MIC is concerned.
  * mark is used to wait asynchronous request.
@@ -131,7 +138,7 @@ int _starpu_driver_copy_data_1_to_1(starpu_data_handle_t handle,
 				    struct _starpu_data_replicate *dst_replicate,
 				    unsigned donotread,
 				    struct _starpu_data_request *req,
-				    unsigned may_alloc,
+				    enum _starpu_may_alloc may_alloc,
 				    enum starpu_is_prefetch prefetch);
 
 unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *async_channel);

+ 6 - 6
src/datawizard/data_request.c

@@ -260,7 +260,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 	return r;
 }
 
-int _starpu_wait_data_request_completion(struct _starpu_data_request *r, unsigned may_alloc)
+int _starpu_wait_data_request_completion(struct _starpu_data_request *r, enum _starpu_may_alloc may_alloc)
 {
 	int retval;
 	int do_delete = 0;
@@ -541,7 +541,7 @@ void _starpu_data_request_complete_wait(void *arg)
 }
 
 /* TODO : accounting to see how much time was spent working for other people ... */
-static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned may_alloc)
+static int starpu_handle_data_request(struct _starpu_data_request *r, enum _starpu_may_alloc may_alloc)
 {
 	starpu_data_handle_t handle = r->handle;
 
@@ -663,7 +663,7 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 	return 0;
 }
 
-static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_list reqlist[STARPU_MAXNODES][STARPU_MAXNODES][2], unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, unsigned may_alloc, unsigned n, unsigned *pushed, enum starpu_is_prefetch prefetch)
+static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_list reqlist[STARPU_MAXNODES][STARPU_MAXNODES][2], unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned n, unsigned *pushed, enum starpu_is_prefetch prefetch)
 {
 	struct _starpu_data_request *r;
 	unsigned i;
@@ -793,17 +793,17 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 	return ret;
 }
 
-int _starpu_handle_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, unsigned may_alloc, unsigned *pushed)
+int _starpu_handle_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed)
 {
 	return __starpu_handle_node_data_requests(data_requests, handling_node, peer_node, inout, may_alloc, MAX_PENDING_REQUESTS_PER_NODE, pushed, STARPU_FETCH);
 }
 
-int _starpu_handle_node_prefetch_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, unsigned may_alloc, unsigned *pushed)
+int _starpu_handle_node_prefetch_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed)
 {
 	return __starpu_handle_node_data_requests(prefetch_requests, handling_node, peer_node, inout, may_alloc, MAX_PENDING_PREFETCH_REQUESTS_PER_NODE, pushed, STARPU_PREFETCH);
 }
 
-int _starpu_handle_node_idle_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, unsigned may_alloc, unsigned *pushed)
+int _starpu_handle_node_idle_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed)
 {
 	return __starpu_handle_node_data_requests(idle_requests, handling_node, peer_node, inout, may_alloc, MAX_PENDING_IDLE_REQUESTS_PER_NODE, pushed, STARPU_IDLEFETCH);
 }

+ 5 - 5
src/datawizard/data_request.h

@@ -143,7 +143,7 @@ LIST_TYPE(_starpu_data_requester,
 
 	int prio;
 
-	/** if this is more complicated ... (eg. application request) 
+	/** if this is more complicated ... (eg. application request)
 	 * NB: this callback is not called with the lock taken !
 	 */
 	void (*ready_data_callback)(void *argcb);
@@ -155,9 +155,9 @@ void _starpu_init_data_request_lists(void);
 void _starpu_deinit_data_request_lists(void);
 void _starpu_post_data_request(struct _starpu_data_request *r);
 /** returns 0 if we have pushed all requests, -EBUSY or -ENOMEM otherwise */
-int _starpu_handle_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, unsigned may_alloc, unsigned *pushed);
-int _starpu_handle_node_prefetch_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, unsigned may_alloc, unsigned *pushed);
-int _starpu_handle_node_idle_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, unsigned may_alloc, unsigned *pushed);
+int _starpu_handle_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed);
+int _starpu_handle_node_prefetch_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed);
+int _starpu_handle_node_idle_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed);
 
 int _starpu_handle_pending_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout);
 int _starpu_handle_all_pending_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout);
@@ -177,7 +177,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 							 unsigned is_write_invalidation,
 							 const char *origin) STARPU_ATTRIBUTE_MALLOC;
 
-int _starpu_wait_data_request_completion(struct _starpu_data_request *r, unsigned may_alloc);
+int _starpu_wait_data_request_completion(struct _starpu_data_request *r, enum _starpu_may_alloc may_alloc);
 
 void _starpu_data_request_append_callback(struct _starpu_data_request *r,
 					  void (*callback_func)(void *),

+ 15 - 10
src/datawizard/datawizard.c

@@ -26,7 +26,7 @@
 #include <core/simgrid.h>
 #endif
 
-static int ____starpu_datawizard_progress(unsigned memory_node, unsigned peer_start, unsigned peer_end, enum  _starpu_data_request_inout inout, unsigned may_alloc, unsigned push_requests)
+static int ____starpu_datawizard_progress(unsigned memory_node, unsigned peer_start, unsigned peer_end, enum  _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned push_requests)
 {
 	int ret = 0;
 	unsigned peer_node;
@@ -86,7 +86,7 @@ static int ____starpu_datawizard_progress(unsigned memory_node, unsigned peer_st
 	return ret;
 }
 
-static int ___starpu_datawizard_progress(unsigned memory_node, unsigned nnodes, unsigned may_alloc, unsigned push_requests)
+static int ___starpu_datawizard_progress(unsigned memory_node, unsigned nnodes, enum _starpu_may_alloc may_alloc, unsigned push_requests)
 {
 	int ret = 0;
 	unsigned peer_node;
@@ -107,7 +107,7 @@ static int ___starpu_datawizard_progress(unsigned memory_node, unsigned nnodes,
 	return ret;
 }
 
-int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests)
+int __starpu_datawizard_progress(enum _starpu_may_alloc may_alloc, unsigned push_requests)
 {
 	struct _starpu_worker *worker = _starpu_get_local_worker_key();
         unsigned memnode;
@@ -130,22 +130,27 @@ int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests)
 		worker = &worker->set->workers[0];
 
 	unsigned current_worker_id = worker->workerid;
-        int ret = 0;
+	int ret = 0;
 	unsigned nnodes = starpu_memory_nodes_get_count();
 
 	if (nnodes > 1)
-        for (memnode = 0; memnode < nnodes; memnode++)
-        {
-                if (_starpu_worker_drives_memory[current_worker_id][memnode] == 1)
-			ret |=  ___starpu_datawizard_progress(memnode, nnodes, may_alloc, push_requests);
-        }
+		for (memnode = 0; memnode < nnodes; memnode++)
+		{
+			if (_starpu_worker_drives_memory[current_worker_id][memnode] == 1)
+			{
+				if(_starpu_config.conf.cuda_only_fast_alloc_other_memnodes && worker->arch == STARPU_CUDA_WORKER && worker->memory_node != memnode)
+					ret |=  ___starpu_datawizard_progress(memnode, nnodes, STARPU_DATAWIZARD_ONLY_FAST_ALLOC, push_requests);
+				else
+					ret |=  ___starpu_datawizard_progress(memnode, nnodes, may_alloc, push_requests);
+				}
+		}
 
 	_starpu_execute_registered_progression_hooks();
 
         return ret;
 }
 
-void _starpu_datawizard_progress(unsigned may_alloc)
+void _starpu_datawizard_progress(enum _starpu_may_alloc may_alloc)
 {
         __starpu_datawizard_progress(may_alloc, 1);
 }

+ 4 - 3
src/datawizard/datawizard.h

@@ -34,16 +34,17 @@
 
 #include <core/dependencies/implicit_data_deps.h>
 
+
 /** Make data transfers progress on all memory nodes driven by the current worker.
  *
  * If \p push_requests is 1, it can start new transfers
  *
- * If \p may_alloc is 1, it can allocate destination data for transfers
+ * If \p may_alloc is STARPU_DATAWIZARD_DO_ALLOC (or STARPU_DATAWIZARD_ONLY_FAST_ALLOC, restricted to fast allocations), it can allocate destination data for transfers
  * (this is not possible e.g. when spinning for a handle lock)
  */
-int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests);
+int __starpu_datawizard_progress(enum _starpu_may_alloc may_alloc, unsigned push_requests);
 /** Call __starpu_datawizard_progress with push_requests = 1 */
-void _starpu_datawizard_progress(unsigned may_alloc);
+void _starpu_datawizard_progress(enum _starpu_may_alloc may_alloc);
 
 /** Check for all pending data request progress on node \p memory_node */
 void _starpu_datawizard_handle_all_pending_node_data_requests(unsigned memnode);

+ 1 - 1
src/datawizard/filters.c

@@ -193,7 +193,7 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 		int home_node = initial_handle->home_node;
 		if (home_node < 0 || (starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
 			home_node = STARPU_MAIN_RAM;
-		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], STARPU_FETCH);
+		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], STARPU_FETCH, 0);
 #ifdef STARPU_DEVEL
 #warning we should reclaim memory if allocation failed
 #endif

+ 12 - 6
src/datawizard/memalloc.c

@@ -1446,7 +1446,7 @@ void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _star
  *
  */
 
-static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned dst_node, enum starpu_is_prefetch is_prefetch)
+static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned dst_node, enum starpu_is_prefetch is_prefetch, int only_fast_alloc)
 {
 	unsigned attempts = 0;
 	starpu_ssize_t allocated_memory;
@@ -1477,6 +1477,12 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 	if (!prefetch_oom)
 		_STARPU_TRACE_END_ALLOC_REUSE(dst_node, handle, 0);
 #endif
+
+	/* If this is RAM and pinned this will be slow
+	   In case we only want fast allocations return here */
+	if(only_fast_alloc && starpu_node_get_kind(dst_node) == STARPU_CPU_RAM && _starpu_malloc_willpin_on_node(dst_node))
+		return -ENOMEM;
+
 	STARPU_ASSERT(handle->ops);
 	STARPU_ASSERT(handle->ops->allocate_data_on_node);
 	STARPU_ASSERT(replicate->data_interface);
@@ -1580,7 +1586,7 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 	{
 		cpt++;
-		_starpu_datawizard_progress(0);
+		_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_NOT_ALLOC);
 	}
 	if (cpt == STARPU_SPIN_MAXTRY)
 		_starpu_spin_lock(&handle->header_lock);
@@ -1624,7 +1630,7 @@ out:
 	return allocated_memory;
 }
 
-int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_is_prefetch is_prefetch)
+int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_is_prefetch is_prefetch, int only_fast_alloc)
 {
 	starpu_ssize_t allocated_memory;
 
@@ -1639,7 +1645,7 @@ int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_
 		return 0;
 
 	STARPU_ASSERT(replicate->data_interface);
-	allocated_memory = _starpu_allocate_interface(handle, replicate, dst_node, is_prefetch);
+	allocated_memory = _starpu_allocate_interface(handle, replicate, dst_node, is_prefetch, only_fast_alloc);
 
 	/* perhaps we could really not handle that capacity misses */
 	if (allocated_memory == -ENOMEM)
@@ -1849,7 +1855,7 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 			unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
 			for (i=0; i<nb_numa_nodes; i++)
 			{
-				if (handle->per_node[i].allocated || 
+				if (handle->per_node[i].allocated ||
 				    _starpu_memory_manager_test_allocate_size(i, size_handle) == 1)
 				{
 					target = i;
@@ -1881,7 +1887,7 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 			unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
 			for (i=0; i<nb_numa_nodes; i++)
 			{
-				if (handle->per_node[i].allocated || 
+				if (handle->per_node[i].allocated ||
 				    _starpu_memory_manager_test_allocate_size(i, size_handle) == 1)
 				{
 					target = i;

+ 1 - 1
src/datawizard/memalloc.h

@@ -83,7 +83,7 @@ void _starpu_init_mem_chunk_lists(void);
 void _starpu_deinit_mem_chunk_lists(void);
 void _starpu_mem_chunk_init_last(void);
 void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned node, size_t size);
-int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_is_prefetch is_prefetch);
+int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_is_prefetch is_prefetch, int only_fast_alloc);
 size_t _starpu_free_all_automatically_allocated_buffers(unsigned node);
 void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node);
 void _starpu_memchunk_wont_use(struct _starpu_mem_chunk *m, unsigned nodec);

+ 1 - 1
src/datawizard/write_back.c

@@ -50,7 +50,7 @@ void _starpu_write_through_data(starpu_data_handle_t handle, unsigned requesting
 				while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 				{
 					cpt++;
-					__starpu_datawizard_progress(1, 1);
+					__starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
 				}
 				if (cpt == STARPU_SPIN_MAXTRY)
 					_starpu_spin_lock(&handle->header_lock);

+ 1 - 1
src/drivers/cpu/driver_cpu.c

@@ -342,7 +342,7 @@ int _starpu_cpu_driver_run_once(struct _starpu_worker *cpu_worker)
 		return ret;
 	}
 
-	res = __starpu_datawizard_progress(1, 1);
+	res = __starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
 
 	if (!pending_task)
 		task = _starpu_get_worker_task(cpu_worker, workerid, memnode);

+ 2 - 2
src/drivers/cuda/driver_cuda.c

@@ -936,13 +936,13 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 	if (!idle_tasks)
 	{
 		/* No task ready yet, no better thing to do than waiting */
-		__starpu_datawizard_progress(1, !idle_transfers);
+		__starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, !idle_transfers);
 		return 0;
 	}
 #endif
 
 	/* Something done, make some progress */
-	res = __starpu_datawizard_progress(1, 1);
+	res = __starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
 
 	/* And pull tasks */
 	res |= _starpu_get_multi_worker_task(worker_set->workers, tasks, worker_set->nworkers, worker0->memory_node);

+ 1 - 1
src/drivers/mp_common/source_common.c

@@ -978,7 +978,7 @@ static void _starpu_src_common_worker_internal_work(struct _starpu_worker_set *
 		}
 	}
 
-        res |= __starpu_datawizard_progress(1, 1);
+        res |= __starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
 
         /* Handle message which have been store */
         _starpu_src_common_handle_stored_async(mp_node);

+ 2 - 2
src/drivers/opencl/driver_opencl.c

@@ -788,12 +788,12 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *worker)
 	if (!idle_tasks)
 	{
 		/* No task ready yet, no better thing to do than waiting */
-		__starpu_datawizard_progress(1, !idle_transfers);
+		__starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, !idle_transfers);
 		return 0;
 	}
 #endif
 
-	res = __starpu_datawizard_progress(1, 1);
+	res = __starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
 
 	task = _starpu_get_worker_task(worker, workerid, memnode);