лет назад: 4 · 42a8d55622
--- a/doc/doxygen/chapters/501_environment_variables.doxy
+++ b/doc/doxygen/chapters/501_environment_variables.doxy
@@ -473,6 +473,16 @@ todo
 
				 todo
			
 
				 </dd>
			
 
				 
			
 
				+<dt>STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES
			
 
				+\addindex __env__STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES
			
 
				+Specify whether CUDA workers should perform only fast allocations
			
 
				+when they drive the datawizard progress of
			
 
				+other memory nodes. If non-zero, STARPU_DATAWIZARD_ONLY_FAST_ALLOC is passed for those nodes.
			
 
				+Default value is 0, allowing CUDA workers to do slow allocations.
			
 
				+</dd>
			
 
				+
			
 
				 </dl>
			
 
				 
			
 
				 \section ConfiguringTheSchedulingEngine Configuring The Scheduling Engine
			
--- a/include/starpu.h
+++ b/include/starpu.h
@@ -471,6 +471,14 @@ struct starpu_conf
 
				 	   Maximum spinning backoff of drivers. Default value: \c 32
			
 
				 	 */
			
 
				 	unsigned driver_spinning_backoff_max;
			
 
				+
			
 
				+	/**
			
 
				+	   Specify whether CUDA workers should perform only fast allocations
			
 
				+	   when they drive the datawizard progress of
			
 
				+	   other memory nodes. If non-zero, STARPU_DATAWIZARD_ONLY_FAST_ALLOC is passed for those nodes.
			
 
				+	   Default value is 0, allowing CUDA workers to do slow allocations.
			
 
				+	 */
			
 
				+	int cuda_only_fast_alloc_other_memnodes;
			
 
				 };
			
 
				 
			
 
				 /**
			
--- a/src/core/dependencies/data_arbiter_concurrency.c
+++ b/src/core/dependencies/data_arbiter_concurrency.c
@@ -286,7 +286,7 @@ unsigned _starpu_attempt_to_submit_arbitered_data_request(unsigned request_from_
 
				 		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
			
 
				 		{
			
 
				 			cpt++;
			
 
				-			_starpu_datawizard_progress(0);
			
 
				+			_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_NOT_ALLOC);
			
 
				 		}
			
 
				 		if (cpt == STARPU_SPIN_MAXTRY)
			
 
				 			_starpu_spin_lock(&handle->header_lock);
			
--- a/src/core/dependencies/data_concurrency.c
+++ b/src/core/dependencies/data_concurrency.c
@@ -132,7 +132,7 @@ static unsigned _starpu_attempt_to_submit_data_request(unsigned request_from_cod
 
				 		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
			
 
				 		{
			
 
				 			cpt++;
			
 
				-			_starpu_datawizard_progress(0);
			
 
				+			_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_NOT_ALLOC);
			
 
				 		}
			
 
				 		if (cpt == STARPU_SPIN_MAXTRY)
			
 
				 			_starpu_spin_lock(&handle->header_lock);
			
@@ -266,7 +266,7 @@ static void _starpu_take_data(unsigned request_from_codelet,
 
				 		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
			
 
				 		{
			
 
				 			cpt++;
			
 
				-			_starpu_datawizard_progress(0);
			
 
				+			_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_NOT_ALLOC);
			
 
				 		}
			
 
				 		if (cpt == STARPU_SPIN_MAXTRY)
			
 
				 			_starpu_spin_lock(&handle->header_lock);
			
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -1168,6 +1168,8 @@ int starpu_conf_init(struct starpu_conf *conf)
 
				 
			
 
				 	/* Do not start performance counter collection by default */
			
 
				 	conf->start_perf_counter_collection = 0;
			
 
				+
			
 
				+	conf->cuda_only_fast_alloc_other_memnodes = starpu_get_env_number_default("STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES", 0);
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
--- a/src/datawizard/coherency.c
+++ b/src/datawizard/coherency.c
@@ -601,7 +601,7 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 
				 		{
			
 
				 			/* And this is the main RAM without pinning, really no need for a
			
 
				 			 * request, just quickly allocate and be done */
			
 
				-			if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch) == 0)
			
 
				+			if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch, 0) == 0)
			
 
				 			{
			
 
				 				_starpu_update_data_state(handle, dst_replicate, mode);
			
 
				 				if (dst_replicate->mc)
			
@@ -778,7 +778,7 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 
				 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
			
 
				 	{
			
 
				 		cpt++;
			
 
				-		_starpu_datawizard_progress(1);
			
 
				+		_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC);
			
 
				 	}
			
 
				 	if (cpt == STARPU_SPIN_MAXTRY)
			
 
				 		_starpu_spin_lock(&handle->header_lock);
			
@@ -918,7 +918,7 @@ void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_
 
				 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
			
 
				 	{
			
 
				 		cpt++;
			
 
				-		_starpu_datawizard_progress(1);
			
 
				+		_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC);
			
 
				 	}
			
 
				 	if (cpt == STARPU_SPIN_MAXTRY)
			
 
				 		_starpu_spin_lock(&handle->header_lock);
			
--- a/src/datawizard/copy_driver.c
+++ b/src/datawizard/copy_driver.c
@@ -200,7 +200,7 @@ int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT _starpu_driver_copy_data_1_to_1(starpu_d
 
				 									struct _starpu_data_replicate *dst_replicate,
			
 
				 									unsigned donotread,
			
 
				 									struct _starpu_data_request *req,
			
 
				-									unsigned may_alloc,
			
 
				+									enum _starpu_may_alloc may_alloc,
			
 
				 									enum starpu_is_prefetch prefetch STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	if (!donotread)
			
@@ -215,11 +215,11 @@ int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT _starpu_driver_copy_data_1_to_1(starpu_d
 
				 	/* first make sure the destination has an allocated buffer */
			
 
				 	if (!dst_replicate->allocated)
			
 
				 	{
			
 
				-		if (!may_alloc || _starpu_is_reclaiming(dst_node))
			
 
				+		if (may_alloc==STARPU_DATAWIZARD_DO_NOT_ALLOC || _starpu_is_reclaiming(dst_node))
			
 
				 			/* We're not supposed to allocate there at the moment */
			
 
				 			return -ENOMEM;
			
 
				 
			
 
				-		int ret_alloc = _starpu_allocate_memory_on_node(handle, dst_replicate, prefetch);
			
 
				+		int ret_alloc = _starpu_allocate_memory_on_node(handle, dst_replicate, prefetch, may_alloc==STARPU_DATAWIZARD_ONLY_FAST_ALLOC);
			
 
				 		if (ret_alloc)
			
 
				 			return -ENOMEM;
			
 
				 	}
			
--- a/src/datawizard/copy_driver.h
+++ b/src/datawizard/copy_driver.h
@@ -47,6 +47,13 @@ extern "C"
 
				 struct _starpu_data_request;
			
 
				 struct _starpu_data_replicate;
			
 
				 
			
 
				+enum _starpu_may_alloc /* Allocation policy handed to the datawizard progress and copy routines */
			
 
				+{
			
 
				+	STARPU_DATAWIZARD_DO_NOT_ALLOC, /**< Do not allocate the destination buffer: a copy to an unallocated replicate returns -ENOMEM */
			
 
				+	STARPU_DATAWIZARD_DO_ALLOC, /**< The destination buffer may be allocated when it is missing */
			
 
				+	STARPU_DATAWIZARD_ONLY_FAST_ALLOC /**< Allocation is permitted, but requested with only_fast_alloc set: a CPU RAM node that would pin memory is refused with -ENOMEM */
			
 
				+};
			
 
				+
			
 
				 #ifdef STARPU_USE_MIC
			
 
				 /** MIC needs memory_node to know which MIC is concerned.
			
 
				  * mark is used to wait asynchronous request.
			
@@ -131,7 +138,7 @@ int _starpu_driver_copy_data_1_to_1(starpu_data_handle_t handle,
 
				 				    struct _starpu_data_replicate *dst_replicate,
			
 
				 				    unsigned donotread,
			
 
				 				    struct _starpu_data_request *req,
			
 
				-				    unsigned may_alloc,
			
 
				+				    enum _starpu_may_alloc may_alloc,
			
 
				 				    enum starpu_is_prefetch prefetch);
			
 
				 
			
 
				 unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *async_channel);
			
--- a/src/datawizard/data_request.c
+++ b/src/datawizard/data_request.c
@@ -260,7 +260,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 
				 	return r;
			
 
				 }
			
 
				 
			
 
				-int _starpu_wait_data_request_completion(struct _starpu_data_request *r, unsigned may_alloc)
			
 
				+int _starpu_wait_data_request_completion(struct _starpu_data_request *r, enum _starpu_may_alloc may_alloc)
			
 
				 {
			
 
				 	int retval;
			
 
				 	int do_delete = 0;
			
@@ -541,7 +541,7 @@ void _starpu_data_request_complete_wait(void *arg)
 
				 }
			
 
				 
			
 
				 /* TODO : accounting to see how much time was spent working for other people ... */
			
 
				-static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned may_alloc)
			
 
				+static int starpu_handle_data_request(struct _starpu_data_request *r, enum _starpu_may_alloc may_alloc)
			
 
				 {
			
 
				 	starpu_data_handle_t handle = r->handle;
			
 
				 
			
@@ -663,7 +663,7 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_list reqlist[STARPU_MAXNODES][STARPU_MAXNODES][2], unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, unsigned may_alloc, unsigned n, unsigned *pushed, enum starpu_is_prefetch prefetch)
			
 
				+static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_list reqlist[STARPU_MAXNODES][STARPU_MAXNODES][2], unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned n, unsigned *pushed, enum starpu_is_prefetch prefetch)
			
 
				 {
			
 
				 	struct _starpu_data_request *r;
			
 
				 	unsigned i;
			
@@ -793,17 +793,17 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-int _starpu_handle_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, unsigned may_alloc, unsigned *pushed)
			
 
				+int _starpu_handle_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed)
			
 
				 {
			
 
				 	return __starpu_handle_node_data_requests(data_requests, handling_node, peer_node, inout, may_alloc, MAX_PENDING_REQUESTS_PER_NODE, pushed, STARPU_FETCH);
			
 
				 }
			
 
				 
			
 
				-int _starpu_handle_node_prefetch_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, unsigned may_alloc, unsigned *pushed)
			
 
				+int _starpu_handle_node_prefetch_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed)
			
 
				 {
			
 
				 	return __starpu_handle_node_data_requests(prefetch_requests, handling_node, peer_node, inout, may_alloc, MAX_PENDING_PREFETCH_REQUESTS_PER_NODE, pushed, STARPU_PREFETCH);
			
 
				 }
			
 
				 
			
 
				-int _starpu_handle_node_idle_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, unsigned may_alloc, unsigned *pushed)
			
 
				+int _starpu_handle_node_idle_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed)
			
 
				 {
			
 
				 	return __starpu_handle_node_data_requests(idle_requests, handling_node, peer_node, inout, may_alloc, MAX_PENDING_IDLE_REQUESTS_PER_NODE, pushed, STARPU_IDLEFETCH);
			
 
				 }
			
--- a/src/datawizard/data_request.h
+++ b/src/datawizard/data_request.h
@@ -143,7 +143,7 @@ LIST_TYPE(_starpu_data_requester,
 
				 
			
 
				 	int prio;
			
 
				 
			
 
				-	/** if this is more complicated ... (eg. application request) 
			
 
				+	/** if this is more complicated ... (eg. application request)
			
 
				 	 * NB: this callback is not called with the lock taken !
			
 
				 	 */
			
 
				 	void (*ready_data_callback)(void *argcb);
			
@@ -155,9 +155,9 @@ void _starpu_init_data_request_lists(void);
 
				 void _starpu_deinit_data_request_lists(void);
			
 
				 void _starpu_post_data_request(struct _starpu_data_request *r);
			
 
				 /** returns 0 if we have pushed all requests, -EBUSY or -ENOMEM otherwise */
			
 
				-int _starpu_handle_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, unsigned may_alloc, unsigned *pushed);
			
 
				-int _starpu_handle_node_prefetch_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, unsigned may_alloc, unsigned *pushed);
			
 
				-int _starpu_handle_node_idle_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, unsigned may_alloc, unsigned *pushed);
			
 
				+int _starpu_handle_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed);
			
 
				+int _starpu_handle_node_prefetch_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed);
			
 
				+int _starpu_handle_node_idle_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed);
			
 
				 
			
 
				 int _starpu_handle_pending_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout);
			
 
				 int _starpu_handle_all_pending_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout);
			
@@ -177,7 +177,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 
				 							 unsigned is_write_invalidation,
			
 
				 							 const char *origin) STARPU_ATTRIBUTE_MALLOC;
			
 
				 
			
 
				-int _starpu_wait_data_request_completion(struct _starpu_data_request *r, unsigned may_alloc);
			
 
				+int _starpu_wait_data_request_completion(struct _starpu_data_request *r, enum _starpu_may_alloc may_alloc);
			
 
				 
			
 
				 void _starpu_data_request_append_callback(struct _starpu_data_request *r,
			
 
				 					  void (*callback_func)(void *),
			
--- a/src/datawizard/datawizard.c
+++ b/src/datawizard/datawizard.c
@@ -26,7 +26,7 @@
 
				 #include <core/simgrid.h>
			
 
				 #endif
			
 
				 
			
 
				-static int ____starpu_datawizard_progress(unsigned memory_node, unsigned peer_start, unsigned peer_end, enum  _starpu_data_request_inout inout, unsigned may_alloc, unsigned push_requests)
			
 
				+static int ____starpu_datawizard_progress(unsigned memory_node, unsigned peer_start, unsigned peer_end, enum  _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned push_requests)
			
 
				 {
			
 
				 	int ret = 0;
			
 
				 	unsigned peer_node;
			
@@ -86,7 +86,7 @@ static int ____starpu_datawizard_progress(unsigned memory_node, unsigned peer_st
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-static int ___starpu_datawizard_progress(unsigned memory_node, unsigned nnodes, unsigned may_alloc, unsigned push_requests)
			
 
				+static int ___starpu_datawizard_progress(unsigned memory_node, unsigned nnodes, enum _starpu_may_alloc may_alloc, unsigned push_requests)
			
 
				 {
			
 
				 	int ret = 0;
			
 
				 	unsigned peer_node;
			
@@ -107,7 +107,7 @@ static int ___starpu_datawizard_progress(unsigned memory_node, unsigned nnodes,
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests)
			
 
				+int __starpu_datawizard_progress(enum _starpu_may_alloc may_alloc, unsigned push_requests)
			
 
				 {
			
 
				 	struct _starpu_worker *worker = _starpu_get_local_worker_key();
			
 
				         unsigned memnode;
			
@@ -130,22 +130,27 @@ int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests)
 
				 		worker = &worker->set->workers[0];
			
 
				 
			
 
				 	unsigned current_worker_id = worker->workerid;
			
 
				-        int ret = 0;
			
 
				+	int ret = 0;
			
 
				 	unsigned nnodes = starpu_memory_nodes_get_count();
			
 
				 
			
 
				 	if (nnodes > 1)
			
 
				-        for (memnode = 0; memnode < nnodes; memnode++)
			
 
				-        {
			
 
				-                if (_starpu_worker_drives_memory[current_worker_id][memnode] == 1)
			
 
				-			ret |=  ___starpu_datawizard_progress(memnode, nnodes, may_alloc, push_requests);
			
 
				-        }
			
 
				+		for (memnode = 0; memnode < nnodes; memnode++)
			
 
				+		{
			
 
				+			if (_starpu_worker_drives_memory[current_worker_id][memnode] == 1)
			
 
				+			{
			
 
				+				if(_starpu_config.conf.cuda_only_fast_alloc_other_memnodes && worker->arch == STARPU_CUDA_WORKER && worker->memory_node != memnode)
			
 
				+					ret |=  ___starpu_datawizard_progress(memnode, nnodes, STARPU_DATAWIZARD_ONLY_FAST_ALLOC, push_requests);
			
 
				+				else
			
 
				+					ret |=  ___starpu_datawizard_progress(memnode, nnodes, may_alloc, push_requests);
			
 
				+				}
			
 
				+		}
			
 
				 
			
 
				 	_starpu_execute_registered_progression_hooks();
			
 
				 
			
 
				         return ret;
			
 
				 }
			
 
				 
			
 
				-void _starpu_datawizard_progress(unsigned may_alloc)
			
 
				+void _starpu_datawizard_progress(enum _starpu_may_alloc may_alloc)
			
 
				 {
			
 
				         __starpu_datawizard_progress(may_alloc, 1);
			
 
				 }
			
--- a/src/datawizard/datawizard.h
+++ b/src/datawizard/datawizard.h
@@ -34,16 +34,17 @@
 
				 
			
 
				 #include <core/dependencies/implicit_data_deps.h>
			
 
				 
			
 
				+
			
 
				 /** Make data transfers progress on all memory nodes driven by the current worker.
			
 
				  *
			
 
				  * If \p push_requests is 1, it can start new transfers
			
 
				  *
			
 
				- * If \p may_alloc is 1, it can allocate destination data for transfers
			
 
				+ * If \p may_alloc is STARPU_DATAWIZARD_DO_ALLOC (or STARPU_DATAWIZARD_ONLY_FAST_ALLOC, restricted to fast allocations), it can allocate destination data for transfers
			
 
				  * (this is not possible e.g. when spinning for a handle lock)
			
 
				  */
			
 
				-int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests);
			
 
				+int __starpu_datawizard_progress(enum _starpu_may_alloc may_alloc, unsigned push_requests);
			
 
				 /** Call __starpu_datawizard_progress with push_requests = 1 */
			
 
				-void _starpu_datawizard_progress(unsigned may_alloc);
			
 
				+void _starpu_datawizard_progress(enum _starpu_may_alloc may_alloc);
			
 
				 
			
 
				 /** Check for all pending data request progress on node \p memory_node */
			
 
				 void _starpu_datawizard_handle_all_pending_node_data_requests(unsigned memnode);
			
--- a/src/datawizard/filters.c
+++ b/src/datawizard/filters.c
@@ -193,7 +193,7 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 
				 		int home_node = initial_handle->home_node;
			
 
				 		if (home_node < 0 || (starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
			
 
				 			home_node = STARPU_MAIN_RAM;
			
 
				-		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], STARPU_FETCH);
			
 
				+		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], STARPU_FETCH, 0);
			
 
				 #ifdef STARPU_DEVEL
			
 
				 #warning we should reclaim memory if allocation failed
			
 
				 #endif
			
--- a/src/datawizard/memalloc.c
+++ b/src/datawizard/memalloc.c
@@ -1446,7 +1446,7 @@ void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _star
 
				  *
			
 
				  */
			
 
				 
			
 
				-static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned dst_node, enum starpu_is_prefetch is_prefetch)
			
 
				+static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned dst_node, enum starpu_is_prefetch is_prefetch, int only_fast_alloc)
			
 
				 {
			
 
				 	unsigned attempts = 0;
			
 
				 	starpu_ssize_t allocated_memory;
			
@@ -1477,6 +1477,12 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 
				 	if (!prefetch_oom)
			
 
				 		_STARPU_TRACE_END_ALLOC_REUSE(dst_node, handle, 0);
			
 
				 #endif
			
 
				+
			
 
				+	/* If this is RAM and pinned this will be slow
			
 
				+	   In case we only want fast allocations return here */
			
 
				+	if(only_fast_alloc && starpu_node_get_kind(dst_node) == STARPU_CPU_RAM && _starpu_malloc_willpin_on_node(dst_node))
			
 
				+		return -ENOMEM;
			
 
				+
			
 
				 	STARPU_ASSERT(handle->ops);
			
 
				 	STARPU_ASSERT(handle->ops->allocate_data_on_node);
			
 
				 	STARPU_ASSERT(replicate->data_interface);
			
@@ -1580,7 +1586,7 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 
				 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
			
 
				 	{
			
 
				 		cpt++;
			
 
				-		_starpu_datawizard_progress(0);
			
 
				+		_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_NOT_ALLOC);
			
 
				 	}
			
 
				 	if (cpt == STARPU_SPIN_MAXTRY)
			
 
				 		_starpu_spin_lock(&handle->header_lock);
			
@@ -1624,7 +1630,7 @@ out:
 
				 	return allocated_memory;
			
 
				 }
			
 
				 
			
 
				-int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_is_prefetch is_prefetch)
			
 
				+int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_is_prefetch is_prefetch, int only_fast_alloc)
			
 
				 {
			
 
				 	starpu_ssize_t allocated_memory;
			
 
				 
			
@@ -1639,7 +1645,7 @@ int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_
 
				 		return 0;
			
 
				 
			
 
				 	STARPU_ASSERT(replicate->data_interface);
			
 
				-	allocated_memory = _starpu_allocate_interface(handle, replicate, dst_node, is_prefetch);
			
 
				+	allocated_memory = _starpu_allocate_interface(handle, replicate, dst_node, is_prefetch, only_fast_alloc);
			
 
				 
			
 
				 	/* perhaps we could really not handle that capacity misses */
			
 
				 	if (allocated_memory == -ENOMEM)
			
@@ -1849,7 +1855,7 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 
				 			unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
			
 
				 			for (i=0; i<nb_numa_nodes; i++)
			
 
				 			{
			
 
				-				if (handle->per_node[i].allocated || 
			
 
				+				if (handle->per_node[i].allocated ||
			
 
				 				    _starpu_memory_manager_test_allocate_size(i, size_handle) == 1)
			
 
				 				{
			
 
				 					target = i;
			
@@ -1881,7 +1887,7 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 
				 			unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
			
 
				 			for (i=0; i<nb_numa_nodes; i++)
			
 
				 			{
			
 
				-				if (handle->per_node[i].allocated || 
			
 
				+				if (handle->per_node[i].allocated ||
			
 
				 				    _starpu_memory_manager_test_allocate_size(i, size_handle) == 1)
			
 
				 				{
			
 
				 					target = i;
			
--- a/src/datawizard/memalloc.h
+++ b/src/datawizard/memalloc.h
@@ -83,7 +83,7 @@ void _starpu_init_mem_chunk_lists(void);
 
				 void _starpu_deinit_mem_chunk_lists(void);
			
 
				 void _starpu_mem_chunk_init_last(void);
			
 
				 void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned node, size_t size);
			
 
				-int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_is_prefetch is_prefetch);
			
 
				+int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_is_prefetch is_prefetch, int only_fast_alloc);
			
 
				 size_t _starpu_free_all_automatically_allocated_buffers(unsigned node);
			
 
				 void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node);
			
 
				 void _starpu_memchunk_wont_use(struct _starpu_mem_chunk *m, unsigned nodec);
			
--- a/src/datawizard/write_back.c
+++ b/src/datawizard/write_back.c
@@ -50,7 +50,7 @@ void _starpu_write_through_data(starpu_data_handle_t handle, unsigned requesting
 
				 				while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
			
 
				 				{
			
 
				 					cpt++;
			
 
				-					__starpu_datawizard_progress(1, 1);
			
 
				+					__starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
			
 
				 				}
			
 
				 				if (cpt == STARPU_SPIN_MAXTRY)
			
 
				 					_starpu_spin_lock(&handle->header_lock);
			
--- a/src/drivers/cpu/driver_cpu.c
+++ b/src/drivers/cpu/driver_cpu.c
@@ -342,7 +342,7 @@ int _starpu_cpu_driver_run_once(struct _starpu_worker *cpu_worker)
 
				 		return ret;
			
 
				 	}
			
 
				 
			
 
				-	res = __starpu_datawizard_progress(1, 1);
			
 
				+	res = __starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
			
 
				 
			
 
				 	if (!pending_task)
			
 
				 		task = _starpu_get_worker_task(cpu_worker, workerid, memnode);
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -936,13 +936,13 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 
				 	if (!idle_tasks)
			
 
				 	{
			
 
				 		/* No task ready yet, no better thing to do than waiting */
			
 
				-		__starpu_datawizard_progress(1, !idle_transfers);
			
 
				+		__starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, !idle_transfers);
			
 
				 		return 0;
			
 
				 	}
			
 
				 #endif
			
 
				 
			
 
				 	/* Something done, make some progress */
			
 
				-	res = __starpu_datawizard_progress(1, 1);
			
 
				+	res = __starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
			
 
				 
			
 
				 	/* And pull tasks */
			
 
				 	res |= _starpu_get_multi_worker_task(worker_set->workers, tasks, worker_set->nworkers, worker0->memory_node);
			
--- a/src/drivers/mp_common/source_common.c
+++ b/src/drivers/mp_common/source_common.c
@@ -978,7 +978,7 @@ static void _starpu_src_common_worker_internal_work(struct _starpu_worker_set *
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-        res |= __starpu_datawizard_progress(1, 1);
			
 
				+        res |= __starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
			
 
				 
			
 
				         /* Handle message which have been store */
			
 
				         _starpu_src_common_handle_stored_async(mp_node);
			
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -788,12 +788,12 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *worker)
 
				 	if (!idle_tasks)
			
 
				 	{
			
 
				 		/* No task ready yet, no better thing to do than waiting */
			
 
				-		__starpu_datawizard_progress(1, !idle_transfers);
			
 
				+		__starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, !idle_transfers);
			
 
				 		return 0;
			
 
				 	}
			
 
				 #endif
			
 
				 
			
 
				-	res = __starpu_datawizard_progress(1, 1);
			
 
				+	res = __starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
			
 
				 
			
 
				 	task = _starpu_get_worker_task(worker, workerid, memnode);