
merge datawizard+drivers

Andra Hugo, 13 years ago
commit 760bc799be
46 changed files with 4134 additions and 2098 deletions
  1. src/datawizard/coherency.c (+245 -135)
  2. src/datawizard/coherency.h (+106 -64)
  3. src/datawizard/copy_driver.c (+123 -97)
  4. src/datawizard/copy_driver.h (+18 -16)
  5. src/datawizard/data_request.c (+256 -98)
  6. src/datawizard/data_request.h (+34 -28)
  7. src/datawizard/datastats.c (+87 -18)
  8. src/datawizard/datastats.h (+12 -2)
  9. src/datawizard/filters.c (+104 -50)
  10. src/datawizard/footprint.c (+24 -14)
  11. src/datawizard/footprint.h (+4 -4)
  12. src/datawizard/interfaces/bcsr_filters.c (+8 -7)
  13. src/datawizard/interfaces/bcsr_interface.c (+97 -88)
  14. src/datawizard/interfaces/block_filters.c (+4 -3)
  15. src/datawizard/interfaces/block_interface.c (+146 -112)
  16. src/datawizard/interfaces/csr_filters.c (+13 -12)
  17. src/datawizard/interfaces/csr_interface.c (+136 -96)
  18. src/datawizard/interfaces/data_interface.c (+179 -75)
  19. src/datawizard/interfaces/data_interface.h (+6 -3)
  20. src/datawizard/interfaces/matrix_filters.c (+26 -23)
  21. src/datawizard/interfaces/matrix_interface.c (+166 -105)
  22. src/datawizard/interfaces/multiformat_interface.c (+724 -0)
  23. src/datawizard/interfaces/variable_interface.c (+101 -78)
  24. src/datawizard/interfaces/vector_filters.c (+53 -45)
  25. src/datawizard/interfaces/vector_interface.c (+111 -87)
  26. src/datawizard/interfaces/void_interface.c (+16 -14)
  27. src/datawizard/memalloc.c (+317 -144)
  28. src/datawizard/memalloc.h (+16 -8)
  29. src/datawizard/memory_nodes.c (+35 -22)
  30. src/datawizard/memory_nodes.h (+20 -24)
  31. src/datawizard/progress.c (+3 -2)
  32. src/datawizard/reduction.c (+155 -82)
  33. src/datawizard/sort_data_handles.c (+33 -12)
  34. src/datawizard/sort_data_handles.h (+1 -1)
  35. src/datawizard/user_interactions.c (+143 -99)
  36. src/datawizard/write_back.c (+32 -16)
  37. src/datawizard/write_back.h (+2 -2)
  38. src/drivers/cpu/driver_cpu.c (+57 -59)
  39. src/drivers/cuda/driver_cuda.c (+98 -67)
  40. src/drivers/driver_common/driver_common.c (+35 -28)
  41. src/drivers/driver_common/driver_common.h (+8 -8)
  42. src/drivers/gordon/driver_gordon.c (+107 -90)
  43. src/drivers/opencl/driver_opencl.c (+143 -97)
  44. src/drivers/opencl/driver_opencl.h (+3 -17)
  45. src/drivers/opencl/driver_opencl_utils.c (+125 -44)
  46. src/drivers/opencl/driver_opencl_utils.h (+2 -2)

src/datawizard/coherency.c (+245 -135)

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,21 +21,27 @@
 #include <datawizard/write_back.h>
 #include <core/dependencies/data_concurrency.h>
 #include <profiling/profiling.h>
+#include <math.h>
 
-uint32_t _starpu_select_src_node(starpu_data_handle handle)
+static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned src_node, unsigned dst_node, unsigned *handling_node);
+uint32_t _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 {
-	unsigned src_node = 0;
+	int src_node = -1;
 	unsigned i;
 
-	unsigned nnodes = _starpu_get_memory_nodes_count();
+	unsigned nnodes = starpu_memory_nodes_get_count();
 
 	/* first find a valid copy, either a STARPU_OWNER or a STARPU_SHARED */
 	uint32_t node;
 
 	uint32_t src_node_mask = 0;
+	size_t size = _starpu_data_get_size(handle);
+	double cost = INFINITY;
+
 	for (node = 0; node < nnodes; node++)
 	{
-		if (handle->per_node[node].state != STARPU_INVALID) {
+		if (handle->per_node[node].state != STARPU_INVALID)
+		{
 			/* we found a copy ! */
 			src_node_mask |= (1<<node);
 		}
@@ -44,7 +50,42 @@ uint32_t _starpu_select_src_node(starpu_data_handle handle)
 	/* we should have found at least one copy ! */
 	STARPU_ASSERT(src_node_mask != 0);
 
-	/* find the node that will be the actual source */
+	/* Without knowing the size, we won't know the cost */
+	if (!size)
+		cost = 0;
+
+	/* Check whether we have transfer cost for all nodes, if so, take the minimum */
+	if (cost)
+		for (i = 0; i < nnodes; i++)
+		{
+			if (src_node_mask & (1<<i))
+			{
+				double time = _starpu_predict_transfer_time(i, destination, size);
+				unsigned handling_node;
+
+				/* Avoid indirect transfers */
+				if (!link_supports_direct_transfers(handle, i, destination, &handling_node))
+					continue;
+
+				if (_STARPU_IS_ZERO(time))
+				{
+					/* No estimation, will have to revert to dumb strategy */
+					cost = 0.0;
+					break;
+				}
+				else if (time < cost)
+				{
+					cost = time;
+					src_node = i;
+				}
+			}
+		}
+
+	if (cost && src_node != -1)
+		/* Could estimate through cost, return that */
+		return src_node;
+
+	/* Revert to dumb strategy: take RAM unless only a GPU has it */
 	for (i = 0; i < nnodes; i++)
 	{
 		if (src_node_mask & (1<<i))
@@ -53,36 +94,41 @@ uint32_t _starpu_select_src_node(starpu_data_handle handle)
 			src_node = i;
 
 			/* however GPU are expensive sources, really !
-			 * 	other should be ok */
-		 
-			if (_starpu_get_node_kind(i) != STARPU_CUDA_RAM && _starpu_get_node_kind(i) != STARPU_OPENCL_RAM)	
+			 * 	Unless peer transfer is supported.
+			 * 	Other should be ok */
+
+			if (
+#ifndef HAVE_CUDA_MEMCPY_PEER
+					starpu_node_get_kind(i) != STARPU_CUDA_RAM &&
+#endif
+					starpu_node_get_kind(i) != STARPU_OPENCL_RAM)
 				break ;
-		 
-			/* XXX do a better algorithm to distribute the memory copies */
-			/* TODO : use the "requesting_node" as an argument to do so */
 		}
 	}
 
+	STARPU_ASSERT(src_node != -1);
+
 	return src_node;
 }
 
 /* this may be called once the data is fetched with header and STARPU_RW-lock hold */
-void _starpu_update_data_state(starpu_data_handle handle,
-				struct starpu_data_replicate_s *requesting_replicate,
-				starpu_access_mode mode)
+void _starpu_update_data_state(starpu_data_handle_t handle,
+			       struct _starpu_data_replicate *requesting_replicate,
+			       enum starpu_access_mode mode)
 {
 	/* There is nothing to do for relaxed coherency modes (scratch or
 	 * reductions) */
 	if (!(mode & STARPU_RW))
 		return;
 
-	unsigned nnodes = _starpu_get_memory_nodes_count();
+	unsigned nnodes = starpu_memory_nodes_get_count();
 
 	/* the data is present now */
 	unsigned requesting_node = requesting_replicate->memory_node;
 	requesting_replicate->requested[requesting_node] = 0;
 
-	if (mode & STARPU_W) {
+	if (mode & STARPU_W)
+	{
 		/* the requesting node now has the only valid copy */
 		uint32_t node;
 		for (node = 0; node < nnodes; node++)
@@ -90,14 +136,15 @@ void _starpu_update_data_state(starpu_data_handle handle,
 
 		requesting_replicate->state = STARPU_OWNER;
 	}
-	else { /* read only */
+	else
+	{ /* read only */
 		if (requesting_replicate->state != STARPU_OWNER)
 		{
 			/* there was at least another copy of the data */
 			uint32_t node;
 			for (node = 0; node < nnodes; node++)
 			{
-				struct starpu_data_replicate_s *replicate = &handle->per_node[node];
+				struct _starpu_data_replicate *replicate = &handle->per_node[node];
 				if (replicate->state != STARPU_INVALID)
 					replicate->state = STARPU_SHARED;
 			}
@@ -111,14 +158,18 @@ static int worker_supports_direct_access(unsigned node, unsigned handling_node)
 	if (node == handling_node)
 		return 1;
 
-	int type = _starpu_get_node_kind(node);
+	if (!_starpu_memory_node_workers(handling_node))
+		/* No worker to process the request from that node */
+		return 0;
+
+	int type = starpu_node_get_kind(node);
 	switch (type)
 	{
 		case STARPU_CUDA_RAM:
 #ifdef HAVE_CUDA_MEMCPY_PEER
 			/* GPUs not always allow direct remote access: if CUDA4
 			 * is enabled, we allow two CUDA devices to communicate. */
-			return (_starpu_get_node_kind(handling_node) != STARPU_OPENCL_RAM);
+			return (starpu_node_get_kind(handling_node) != STARPU_OPENCL_RAM);
 #else
 			/* Direct GPU-GPU transfers are not allowed in general */
 			return 0;
@@ -130,13 +181,15 @@ static int worker_supports_direct_access(unsigned node, unsigned handling_node)
 	}
 }
 
-static int link_supports_direct_transfers(starpu_data_handle handle, unsigned src_node, unsigned dst_node, unsigned *handling_node)
+static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned src_node, unsigned dst_node, unsigned *handling_node)
 {
+	(void) handle; // unused
+
 	/* XXX That's a hack until we get cudaMemcpy3DPeerAsync to work !
 	 * Perhaps not all data interface provide a direct GPU-GPU transfer
 	 * method ! */
 #ifdef STARPU_USE_CUDA
-	if (src_node != dst_node && _starpu_get_node_kind(src_node) == STARPU_CUDA_RAM && _starpu_get_node_kind(dst_node) == STARPU_CUDA_RAM)
+	if (src_node != dst_node && starpu_node_get_kind(src_node) == STARPU_CUDA_RAM && starpu_node_get_kind(dst_node) == STARPU_CUDA_RAM)
 	{
 		const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
 		if (!copy_methods->cuda_to_cuda_async)
@@ -163,11 +216,11 @@ static int link_supports_direct_transfers(starpu_data_handle handle, unsigned sr
  * node that handles the hop. The returned value indicates the number of hops,
  * and the max_len is the maximum number of hops (ie. the size of the
  * src_nodes, dst_nodes and handling_nodes arrays. */
-static int determine_request_path(starpu_data_handle handle,
-				unsigned src_node, unsigned dst_node,
-				starpu_access_mode mode, int max_len,
-				unsigned *src_nodes, unsigned *dst_nodes,
-				unsigned *handling_nodes)
+static int determine_request_path(starpu_data_handle_t handle,
+				  unsigned src_node, unsigned dst_node,
+				  enum starpu_access_mode mode, int max_len,
+				  unsigned *src_nodes, unsigned *dst_nodes,
+				  unsigned *handling_nodes)
 {
 	if (!(mode & STARPU_R))
 	{
@@ -182,7 +235,8 @@ static int determine_request_path(starpu_data_handle handle,
 	unsigned handling_node;
 	int link_is_valid = link_supports_direct_transfers(handle, src_node, dst_node, &handling_node);
 
-	if (!link_is_valid) {
+	if (!link_is_valid)
+	{
 		/* We need an intermediate hop to implement data staging
 		 * through main memory. */
 		STARPU_ASSERT(max_len >= 2);
@@ -201,15 +255,16 @@ static int determine_request_path(starpu_data_handle handle,
 
 		return 2;
 	}
-	else {
+	else
+	{
 		STARPU_ASSERT(max_len >= 1);
-		
+
 		src_nodes[0] = src_node;
 		dst_nodes[0] = dst_node;
 		handling_nodes[0] = handling_node;
 
 #ifndef HAVE_CUDA_MEMCPY_PEER
-		STARPU_ASSERT(!(mode & STARPU_R) || _starpu_get_node_kind(src_node) != STARPU_CUDA_RAM || _starpu_get_node_kind(dst_node) != STARPU_CUDA_RAM);
+		STARPU_ASSERT(!(mode & STARPU_R) || starpu_node_get_kind(src_node) != STARPU_CUDA_RAM || starpu_node_get_kind(dst_node) != STARPU_CUDA_RAM);
 #endif
 
 		return 1;
@@ -219,9 +274,9 @@ static int determine_request_path(starpu_data_handle handle,
 /* handle->lock should be taken. r is returned locked. The node parameter
  * indicate either the source of the request, or the destination for a
  * write-only request. */
-static starpu_data_request_t _starpu_search_existing_data_request(struct starpu_data_replicate_s *replicate, unsigned node, starpu_access_mode mode)
+static struct _starpu_data_request *_starpu_search_existing_data_request(struct _starpu_data_replicate *replicate, unsigned node, enum starpu_access_mode mode, unsigned is_prefetch)
 {
-	starpu_data_request_t r;
+	struct _starpu_data_request *r;
 
 	r = replicate->request[node];
 
@@ -229,20 +284,28 @@ static starpu_data_request_t _starpu_search_existing_data_request(struct starpu_
 	{
 		_starpu_spin_lock(&r->lock);
 
-		/* perhaps we need to "upgrade" the request */
+                /* perhaps we need to "upgrade" the request */
+		if (is_prefetch < r->prefetch)
+			_starpu_update_prefetch_status(r);
+
 		if (mode & STARPU_R)
 		{
 			/* in case the exisiting request did not imply a memory
-			 * transfer yet, we have to increment the refcnt now
+			 * transfer yet, we have to take a second refcnt now
+			 * for the source, in addition to the refcnt for the
+			 * destination
 			 * (so that the source remains valid) */
 			if (!(r->mode & STARPU_R))
+			{
 				replicate->refcnt++;
+				replicate->handle->busy_count++;
+			}
 
-			r->mode |= STARPU_R;
+			r->mode = (enum starpu_access_mode) ((int) r->mode | (int) STARPU_R);
 		}
 
 		if (mode & STARPU_W)
-			r->mode |= STARPU_W;
+			r->mode = (enum starpu_access_mode) ((int) r->mode | (int)  STARPU_W);
 	}
 
 	return r;
@@ -252,38 +315,53 @@ static starpu_data_request_t _starpu_search_existing_data_request(struct starpu_
 
 /*
  * This function is called when the data is needed on the local node, this
- * returns a pointer to the local copy 
+ * returns a pointer to the local copy
  *
  *			R 	STARPU_W 	STARPU_RW
  *	Owner		OK	OK	OK
  *	Shared		OK	1	1
  *	Invalid		2	3	4
  *
- * case 1 : shared + (read)write : 
+ * case 1 : shared + (read)write :
  * 	no data copy but shared->Invalid/Owner
- * case 2 : invalid + read : 
+ * case 2 : invalid + read :
  * 	data copy + invalid->shared + owner->shared (STARPU_ASSERT(there is a valid))
- * case 3 : invalid + write : 
+ * case 3 : invalid + write :
  * 	no data copy + invalid->owner + (owner,shared)->invalid
- * case 4 : invalid + R/STARPU_W : 
- * 	data copy + if (STARPU_W) (invalid->owner + owner->invalid) 
+ * case 4 : invalid + R/STARPU_W :
+ * 	data copy + if (STARPU_W) (invalid->owner + owner->invalid)
  * 		    else (invalid,owner->shared)
  */
 
 /* This function is called with handle's header lock taken */
-starpu_data_request_t create_request_to_fetch_data(starpu_data_handle handle,
-				struct starpu_data_replicate_s *dst_replicate,
-                                starpu_access_mode mode, unsigned is_prefetch,
-                                void (*callback_func)(void *), void *callback_arg)
+struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
+								  struct _starpu_data_replicate *dst_replicate,
+								  enum starpu_access_mode mode, unsigned is_prefetch,
+								  unsigned async,
+								  void (*callback_func)(void *), void *callback_arg)
 {
 	unsigned requesting_node = dst_replicate->memory_node;
 
 	if (dst_replicate->state != STARPU_INVALID)
 	{
+#ifdef STARPU_MEMORY_STATUS
+		enum _starpu_cache_state old_state = dst_replicate->state;
+#endif
 		/* the data is already available so we can stop */
 		_starpu_update_data_state(handle, dst_replicate, mode);
 		_starpu_msi_cache_hit(requesting_node);
 
+#ifdef STARPU_MEMORY_STATUS
+		_starpu_handle_stats_cache_hit(handle, requesting_node);
+
+		/* XXX Broken ? */
+		if (old_state == STARPU_SHARED
+		    && dst_replicate->state == STARPU_OWNER)
+			_starpu_handle_stats_shared_to_owner(handle, requesting_node);
+#endif
+
+		_starpu_memchunk_recently_used(dst_replicate->mc, requesting_node);
+
 		_starpu_spin_unlock(&handle->header_lock);
 
 		if (callback_func)
@@ -304,7 +382,7 @@ starpu_data_request_t create_request_to_fetch_data(starpu_data_handle handle,
 	/* if the data is in write only mode, there is no need for a source */
 	if (mode & STARPU_R)
 	{
-		src_node = _starpu_select_src_node(handle);
+		src_node = _starpu_select_src_node(handle, requesting_node);
 		STARPU_ASSERT(src_node != requesting_node);
 	}
 
@@ -313,9 +391,9 @@ starpu_data_request_t create_request_to_fetch_data(starpu_data_handle handle,
 	unsigned src_nodes[4], dst_nodes[4], handling_nodes[4];
 	int nhops = determine_request_path(handle, src_node, requesting_node, mode, 4,
 					src_nodes, dst_nodes, handling_nodes);
-	STARPU_ASSERT(nhops <= 4);
 
-	starpu_data_request_t requests[nhops];
+	STARPU_ASSERT(nhops >= 1 && nhops <= 4);
+	struct _starpu_data_request *requests[nhops];
 
 	/* Did we reuse a request for that hop ? */
 	int reused_requests[nhops];
@@ -324,14 +402,14 @@ starpu_data_request_t create_request_to_fetch_data(starpu_data_handle handle,
 	int hop;
 	for (hop = 0; hop < nhops; hop++)
 	{
-		starpu_data_request_t r;
+		struct _starpu_data_request *r;
 
 		unsigned hop_src_node = src_nodes[hop];
 		unsigned hop_dst_node = dst_nodes[hop];
 		unsigned hop_handling_node = handling_nodes[hop];
 
-		struct starpu_data_replicate_s *hop_src_replicate;
-		struct starpu_data_replicate_s *hop_dst_replicate;
+		struct _starpu_data_replicate *hop_src_replicate;
+		struct _starpu_data_replicate *hop_dst_replicate;
 
 		/* Only the first request is independant */
 		unsigned ndeps = (hop == 0)?0:1;
@@ -341,24 +419,26 @@ starpu_data_request_t create_request_to_fetch_data(starpu_data_handle handle,
 
 		/* Try to reuse a request if possible */
 		r = _starpu_search_existing_data_request(hop_dst_replicate,
-				(mode & STARPU_R)?hop_src_node:hop_dst_node, mode);
+				(mode & STARPU_R)?hop_src_node:hop_dst_node,
+							 mode, is_prefetch);
 
 		reused_requests[hop] = !!r;
 
-		if (!r) {
+		if (!r)
+		{
 			/* Create a new request if there was no request to reuse */
 			r = _starpu_create_data_request(handle, hop_src_replicate,
-					hop_dst_replicate, hop_handling_node,
-					mode, ndeps);
+							hop_dst_replicate, hop_handling_node,
+							mode, ndeps, is_prefetch);
 		}
 
-		requests[hop] = r; 
+		requests[hop] = r;
 	}
 
 	/* Chain these requests */
 	for (hop = 0; hop < nhops; hop++)
 	{
-		starpu_data_request_t r;
+		struct _starpu_data_request *r;
 		r = requests[hop];
 
 		if (hop != nhops - 1)
@@ -374,7 +454,7 @@ starpu_data_request_t create_request_to_fetch_data(starpu_data_handle handle,
 			_starpu_spin_unlock(&r->lock);
 	}
 
-	if (!is_prefetch)
+	if (!async)
 		requests[nhops - 1]->refcnt++;
 
 
@@ -386,9 +466,9 @@ starpu_data_request_t create_request_to_fetch_data(starpu_data_handle handle,
 	return requests[nhops - 1];
 }
 
-int _starpu_fetch_data_on_node(starpu_data_handle handle, struct starpu_data_replicate_s *dst_replicate,
-				starpu_access_mode mode, unsigned is_prefetch,
-				void (*callback_func)(void *), void *callback_arg)
+int _starpu_fetch_data_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *dst_replicate,
+			       enum starpu_access_mode mode, unsigned detached, unsigned async,
+			       void (*callback_func)(void *), void *callback_arg)
 {
 	uint32_t local_node = _starpu_get_local_memory_node();
         _STARPU_LOG_IN();
@@ -396,57 +476,62 @@ int _starpu_fetch_data_on_node(starpu_data_handle handle, struct starpu_data_rep
 	while (_starpu_spin_trylock(&handle->header_lock))
 		_starpu_datawizard_progress(local_node, 1);
 
-	if (!is_prefetch)
+	if (!detached)
+	{
+		/* Take a reference which will be released by _starpu_release_data_on_node */
 		dst_replicate->refcnt++;
+		dst_replicate->handle->busy_count++;
+	}
 
-	starpu_data_request_t r;
-	r = create_request_to_fetch_data(handle, dst_replicate, mode,
-					is_prefetch, callback_func, callback_arg);
+	struct _starpu_data_request *r;
+	r = _starpu_create_request_to_fetch_data(handle, dst_replicate, mode,
+						 detached, async, callback_func, callback_arg);
 
 	/* If no request was created, the handle was already up-to-date on the
-	 * node. In this case, create_request_to_fetch_data has already
+	 * node. In this case, _starpu_create_request_to_fetch_data has already
 	 * unlocked the header. */
 	if (!r)
 		return 0;
-	
+
 	_starpu_spin_unlock(&handle->header_lock);
 
-	int ret = is_prefetch?0:_starpu_wait_data_request_completion(r, 1);
+	int ret = async?0:_starpu_wait_data_request_completion(r, 1);
         _STARPU_LOG_OUT();
         return ret;
 }
 
-static int prefetch_data_on_node(starpu_data_handle handle, struct starpu_data_replicate_s *replicate, starpu_access_mode mode)
+static int prefetch_data_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_access_mode mode)
 {
-	return _starpu_fetch_data_on_node(handle, replicate, mode, 1, NULL, NULL);
+	return _starpu_fetch_data_on_node(handle, replicate, mode, 1, 1, NULL, NULL);
 }
 
-static int fetch_data(starpu_data_handle handle, struct starpu_data_replicate_s *replicate, starpu_access_mode mode)
+static int fetch_data(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_access_mode mode)
 {
-	return _starpu_fetch_data_on_node(handle, replicate, mode, 0, NULL, NULL);
+	return _starpu_fetch_data_on_node(handle, replicate, mode, 0, 0, NULL, NULL);
 }
 
-uint32_t _starpu_get_data_refcnt(starpu_data_handle handle, uint32_t node)
+uint32_t _starpu_get_data_refcnt(starpu_data_handle_t handle, uint32_t node)
 {
 	return handle->per_node[node].refcnt;
 }
 
-size_t _starpu_data_get_size(starpu_data_handle handle)
+size_t _starpu_data_get_size(starpu_data_handle_t handle)
 {
 	return handle->data_size;
 }
 
-uint32_t _starpu_data_get_footprint(starpu_data_handle handle)
+uint32_t _starpu_data_get_footprint(starpu_data_handle_t handle)
 {
 	return handle->footprint;
 }
 
-/* in case the data was accessed on a write mode, do not forget to 
+/* in case the data was accessed on a write mode, do not forget to
  * make it accessible again once it is possible ! */
-void _starpu_release_data_on_node(starpu_data_handle handle, uint32_t default_wt_mask, struct starpu_data_replicate_s *replicate)
+void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_wt_mask, struct _starpu_data_replicate *replicate)
 {
 	uint32_t wt_mask;
 	wt_mask = default_wt_mask | handle->wt_mask;
+	wt_mask &= (1<<starpu_memory_nodes_get_count())-1;
 
 	/* Note that it is possible that there is no valid copy of the data (if
 	 * starpu_data_invalidate was called for instance). In that case, we do
@@ -454,7 +539,7 @@ void _starpu_release_data_on_node(starpu_data_handle handle, uint32_t default_wt
 
 	unsigned memory_node = replicate->memory_node;
 
-	if (replicate->state != STARPU_INVALID)
+	if (replicate->state != STARPU_INVALID && handle->current_mode & STARPU_W)
 	if ((wt_mask & ~(1<<memory_node)))
 		_starpu_write_through_data(handle, memory_node, wt_mask);
 
@@ -462,10 +547,14 @@ void _starpu_release_data_on_node(starpu_data_handle handle, uint32_t default_wt
 	while (_starpu_spin_trylock(&handle->header_lock))
 		_starpu_datawizard_progress(local_node, 1);
 
+	/* Release refcnt taken by fetch_data_on_node */
 	replicate->refcnt--;
-
 	STARPU_ASSERT(replicate->refcnt >= 0);
 
+	STARPU_ASSERT(handle->busy_count > 0);
+	handle->busy_count--;
+	_starpu_data_check_not_busy(handle);
+
 	/* In case there was a temporary handle (eg. used for reduction), this
 	 * handle may have requested to be destroyed when the data is released
 	 * */
@@ -477,35 +566,34 @@ void _starpu_release_data_on_node(starpu_data_handle handle, uint32_t default_wt
 		_starpu_spin_unlock(&handle->header_lock);
 }
 
-static void _starpu_set_data_requested_flag_if_needed(struct starpu_data_replicate_s *replicate)
+static void _starpu_set_data_requested_flag_if_needed(struct _starpu_data_replicate *replicate)
 {
 // XXX : this is just a hint, so we don't take the lock ...
-//	pthread_spin_lock(&handle->header_lock);
+//	_STARPU_PTHREAD_SPIN_LOCK(&handle->header_lock);
 
-	if (replicate->state == STARPU_INVALID) 
+	if (replicate->state == STARPU_INVALID)
 	{
 		unsigned dst_node = replicate->memory_node;
 		replicate->requested[dst_node] = 1;
 	}
 
-//	pthread_spin_unlock(&handle->header_lock);
+//	_STARPU_PTHREAD_SPIN_UNLOCK(&handle->header_lock);
 }
 
 int starpu_prefetch_task_input_on_node(struct starpu_task *task, uint32_t node)
 {
-	starpu_buffer_descr *descrs = task->buffers;
 	unsigned nbuffers = task->cl->nbuffers;
-
 	unsigned index;
+
 	for (index = 0; index < nbuffers; index++)
 	{
-		starpu_data_handle handle = descrs[index].handle;
-		starpu_access_mode mode = descrs[index].mode;
+		starpu_data_handle_t handle = task->handles[index];
+		enum starpu_access_mode mode = task->cl->modes[index];
 
 		if (mode & (STARPU_SCRATCH|STARPU_REDUX))
 			continue;
 
-		struct starpu_data_replicate_s *replicate = &handle->per_node[node];
+		struct _starpu_data_replicate *replicate = &handle->per_node[node];
 		prefetch_data_on_node(handle, replicate, mode);
 
 		_starpu_set_data_requested_flag_if_needed(replicate);
@@ -514,15 +602,25 @@ int starpu_prefetch_task_input_on_node(struct starpu_task *task, uint32_t node)
 	return 0;
 }
 
-int _starpu_fetch_task_input(struct starpu_task *task, uint32_t mask)
+static struct _starpu_data_replicate *get_replicate(starpu_data_handle_t handle, enum starpu_access_mode mode, int workerid, unsigned local_memory_node)
+{
+	if (mode & (STARPU_SCRATCH|STARPU_REDUX))
+		return &handle->per_worker[workerid];
+	else
+		/* That's a "normal" buffer (R/W) */
+		return &handle->per_node[local_memory_node];
+}
+
+int _starpu_fetch_task_input(struct _starpu_job *j, uint32_t mask)
 {
-	STARPU_TRACE_START_FETCH_INPUT(NULL);
+	_STARPU_TRACE_START_FETCH_INPUT(NULL);
 
 	int profiling = starpu_profiling_status_get();
+	struct starpu_task *task = j->task;
 	if (profiling && task->profiling_info)
-		starpu_clock_gettime(&task->profiling_info->acquire_data_start_time);
+		_starpu_clock_gettime(&task->profiling_info->acquire_data_start_time);
 
-	starpu_buffer_descr *descrs = task->buffers;
+	struct starpu_buffer_descr *descrs = j->ordered_buffers;
 	unsigned nbuffers = task->cl->nbuffers;
 
 	unsigned local_memory_node = _starpu_get_local_memory_node();
@@ -533,23 +631,33 @@ int _starpu_fetch_task_input(struct starpu_task *task, uint32_t mask)
 	for (index = 0; index < nbuffers; index++)
 	{
 		int ret;
-		starpu_data_handle handle = descrs[index].handle;
-		starpu_access_mode mode = descrs[index].mode;
+		starpu_data_handle_t handle = descrs[index].handle;
+		enum starpu_access_mode mode = descrs[index].mode;
 
-		struct starpu_data_replicate_s *local_replicate;
+		struct _starpu_data_replicate *local_replicate;
 
-		if (mode & (STARPU_SCRATCH|STARPU_REDUX))
-		{
-			local_replicate = &handle->per_worker[workerid];
-		}
-		else {
-			/* That's a "normal" buffer (R/W) */
-			local_replicate = &handle->per_node[local_memory_node];
-		}
+		if (index && descrs[index-1].handle == descrs[index].handle)
+			/* We have already took this data, skip it. This
+			 * depends on ordering putting writes before reads, see
+			 * _starpu_compar_handles */
+			continue;
+
+		local_replicate = get_replicate(handle, mode, workerid, local_memory_node);
 
 		ret = fetch_data(handle, local_replicate, mode);
 		if (STARPU_UNLIKELY(ret))
 			goto enomem;
+	}
+
+	/* Now that we have taken the data locks in locking order, fill the codelet interfaces in function order.  */
+	for (index = 0; index < nbuffers; index++)
+	{
+		starpu_data_handle_t handle = task->handles[index];
+		enum starpu_access_mode mode = task->cl->modes[index];
+
+		struct _starpu_data_replicate *local_replicate;
+
+		local_replicate = get_replicate(handle, mode, workerid, local_memory_node);
 
 		task->interfaces[index] = local_replicate->data_interface;
 
@@ -562,9 +670,9 @@ int _starpu_fetch_task_input(struct starpu_task *task, uint32_t mask)
 	}
 
 	if (profiling && task->profiling_info)
-		starpu_clock_gettime(&task->profiling_info->acquire_data_end_time);
+		_starpu_clock_gettime(&task->profiling_info->acquire_data_end_time);
 
-	STARPU_TRACE_END_FETCH_INPUT(NULL);
+	_STARPU_TRACE_END_FETCH_INPUT(NULL);
 
 	return 0;
 
@@ -573,39 +681,40 @@ enomem:
 	/* XXX broken ... */
 	_STARPU_DISP("something went wrong with buffer %u\n", index);
 	//push_codelet_output(task, index, mask);
-	_starpu_push_task_output(task, mask);
+	_starpu_push_task_output(j, mask);
 	return -1;
 }
 
-void _starpu_push_task_output(struct starpu_task *task, uint32_t mask)
+void _starpu_push_task_output(struct _starpu_job *j, uint32_t mask)
 {
-	STARPU_TRACE_START_PUSH_OUTPUT(NULL);
+	_STARPU_TRACE_START_PUSH_OUTPUT(NULL);
 
 	int profiling = starpu_profiling_status_get();
+	struct starpu_task *task = j->task;
 	if (profiling && task->profiling_info)
-		starpu_clock_gettime(&task->profiling_info->release_data_start_time);
+		_starpu_clock_gettime(&task->profiling_info->release_data_start_time);
 
-        starpu_buffer_descr *descrs = task->buffers;
+        struct starpu_buffer_descr *descrs = j->ordered_buffers;
         unsigned nbuffers = task->cl->nbuffers;
 
+	int workerid = starpu_worker_get_id();
+	unsigned local_memory_node = _starpu_get_local_memory_node();
+
 	unsigned index;
 	for (index = 0; index < nbuffers; index++)
 	{
-		starpu_data_handle handle = descrs[index].handle;
-		starpu_access_mode mode = descrs[index].mode;
+		starpu_data_handle_t handle = descrs[index].handle;
+		enum starpu_access_mode mode = descrs[index].mode;
 
-		struct starpu_data_replicate_s *replicate;
+		struct _starpu_data_replicate *local_replicate;
 
-		if (mode & STARPU_RW)
-		{
-			unsigned local_node = _starpu_get_local_memory_node();
-			replicate = &handle->per_node[local_node];
-		}
-		else
-		{
-			int workerid = starpu_worker_get_id();
-			replicate = &handle->per_worker[workerid];
-		}
+		if (index && descrs[index-1].handle == descrs[index].handle)
+			/* We have already released this data, skip it. This
+			 * depends on ordering putting writes before reads, see
+			 * _starpu_compar_handles */
+			continue;
+
+		local_replicate = get_replicate(handle, mode, workerid, local_memory_node);
 
 		/* In case there was a temporary handle (eg. used for
 		 * reduction), this handle may have requested to be destroyed
@@ -613,33 +722,34 @@ void _starpu_push_task_output(struct starpu_task *task, uint32_t mask)
 		 * */
 		unsigned handle_was_destroyed = handle->lazy_unregister;
 
-		_starpu_release_data_on_node(handle, mask, replicate);
+		_starpu_release_data_on_node(handle, mask, local_replicate);
 		if (!handle_was_destroyed)
 			_starpu_release_data_enforce_sequential_consistency(task, handle);
 	}
 
 	if (profiling && task->profiling_info)
-		starpu_clock_gettime(&task->profiling_info->release_data_end_time);
+		_starpu_clock_gettime(&task->profiling_info->release_data_end_time);
 
-	STARPU_TRACE_END_PUSH_OUTPUT(NULL);
+	_STARPU_TRACE_END_PUSH_OUTPUT(NULL);
 }
 
 /* NB : this value can only be an indication of the status of a data
 	at some point, but there is no strong garantee ! */
-unsigned _starpu_is_data_present_or_requested(starpu_data_handle handle, uint32_t node)
+unsigned _starpu_is_data_present_or_requested(starpu_data_handle_t handle, uint32_t node)
 {
 	unsigned ret = 0;
 
 // XXX : this is just a hint, so we don't take the lock ...
-//	pthread_spin_lock(&handle->header_lock);
+//	_STARPU_PTHREAD_SPIN_LOCK(&handle->header_lock);
 
 	if (handle->per_node[node].state != STARPU_INVALID)
 	{
 		ret  = 1;
 	}
-	else {
+	else
+	{
 		unsigned i;
-		unsigned nnodes = _starpu_get_memory_nodes_count();
+		unsigned nnodes = starpu_memory_nodes_get_count();
 
 		for (i = 0; i < nnodes; i++)
 		{
@@ -649,7 +759,7 @@ unsigned _starpu_is_data_present_or_requested(starpu_data_handle handle, uint32_
 
 	}
 
-//	pthread_spin_unlock(&handle->header_lock);
+//	_STARPU_PTHREAD_SPIN_UNLOCK(&handle->header_lock);
 
 	return ret;
 }
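The new _starpu_select_src_node() above picks, among the nodes holding a valid replicate, the one with the lowest predicted transfer time towards the destination, and falls back to the older heuristic (prefer main memory over GPU memory) whenever an estimate is missing. Below is a minimal, self-contained sketch of that policy; struct node, the predicted_time field and the node kinds are illustrative stand-ins, not StarPU API.

/* Standalone sketch of the source-node selection policy added above:
 * among nodes holding a valid copy, pick the one with the lowest
 * predicted transfer time; if any estimate is missing, fall back to
 * preferring main memory over GPU memory. */
#include <math.h>
#include <stdio.h>

enum node_kind { CPU_RAM, CUDA_RAM, OPENCL_RAM };

struct node
{
	enum node_kind kind;
	int has_valid_copy;	/* replicate state != INVALID */
	double predicted_time;	/* predicted transfer cost to dst, 0.0 = unknown */
};

static int select_src_node(const struct node *nodes, int nnodes)
{
	int best = -1, i;
	double best_cost = INFINITY;

	/* First pass: use the transfer-time estimates when they all exist. */
	for (i = 0; i < nnodes; i++)
	{
		if (!nodes[i].has_valid_copy)
			continue;
		if (nodes[i].predicted_time == 0.0)
		{
			best = -1;	/* no estimate: revert to the dumb strategy */
			break;
		}
		if (nodes[i].predicted_time < best_cost)
		{
			best_cost = nodes[i].predicted_time;
			best = i;
		}
	}
	if (best != -1)
		return best;

	/* Dumb strategy: take any valid copy, but prefer plain RAM over GPUs. */
	for (i = 0; i < nnodes; i++)
	{
		if (!nodes[i].has_valid_copy)
			continue;
		best = i;
		if (nodes[i].kind == CPU_RAM)
			break;
	}
	return best;
}

int main(void)
{
	struct node nodes[3] =
	{
		{ CPU_RAM,    1, 4.0 },
		{ CUDA_RAM,   1, 1.5 },
		{ OPENCL_RAM, 0, 0.0 },
	};
	printf("selected source node: %d\n", select_src_node(nodes, 3));
	return 0;
}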

src/datawizard/coherency.h (+106 -64)

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -31,15 +31,16 @@
 #include <datawizard/interfaces/data_interface.h>
 #include <datawizard/datastats.h>
 
-typedef enum {
+enum _starpu_cache_state
+{
 	STARPU_OWNER,
 	STARPU_SHARED,
 	STARPU_INVALID
-} starpu_cache_state;
+};
 
 /* this should contain the information relative to a given data replicate  */
-LIST_TYPE(starpu_data_replicate,
-	starpu_data_handle handle;
+LIST_TYPE(_starpu_data_replicate,
+	starpu_data_handle_t handle;
 
 	/* describe the actual data layout */
 	void *data_interface;
@@ -55,65 +56,83 @@ LIST_TYPE(starpu_data_replicate,
 	unsigned initialized;
 
 	/* describes the state of the local data in term of coherency */
-	starpu_cache_state	state; 
+	enum _starpu_cache_state	state;
 
 	int refcnt;
 
 	/* is the data locally allocated ? */
-	uint8_t allocated; 
-	/* was it automatically allocated ? */
-	/* perhaps the allocation was perform higher in the hiearchy 
+	uint8_t allocated;
+	/* was it automatically allocated ? (else it's the application-provided
+	 * buffer, don't ever try to free it!) */
+	/* perhaps the allocation was perform higher in the hiearchy
 	 * for now this is just translated into !automatically_allocated
 	 * */
 	uint8_t automatically_allocated;
 
+        /* Pointer to memchunk for LRU strategy */
+	struct _starpu_mem_chunk * mc;
+
 	/* To help the scheduling policies to make some decision, we
-	   may keep a track of the tasks that are likely to request 
+	   may keep a track of the tasks that are likely to request
 	   this data on the current node.
 	   It is the responsability of the scheduling _policy_ to set that
 	   flag when it assigns a task to a queue, policies which do not
 	   use this hint can simply ignore it.
 	 */
 	uint8_t requested[STARPU_MAXNODES];
-	struct starpu_data_request_s *request[STARPU_MAXNODES];
-);
+	struct _starpu_data_request *request[STARPU_MAXNODES];
+)
 
-struct starpu_data_requester_list_s;
+struct _starpu_data_requester_list;
 
-struct starpu_jobid_list {
+struct _starpu_jobid_list
+{
 	unsigned long id;
-	struct starpu_jobid_list *next;
+	struct _starpu_jobid_list *next;
 };
 
 /* This structure describes a simply-linked list of task */
-struct starpu_task_wrapper_list {
+struct _starpu_task_wrapper_list
+{
 	struct starpu_task *task;
-	struct starpu_task_wrapper_list *next;
+	struct _starpu_task_wrapper_list *next;
 };
 
-struct starpu_data_state_t {
-	struct starpu_data_requester_list_s *req_list;
-	/* the number of requests currently in the scheduling engine
-	 * (not in the req_list anymore) */
+struct _starpu_data_state
+{
+	struct _starpu_data_requester_list *req_list;
+	/* the number of requests currently in the scheduling engine (not in
+	 * the req_list anymore), i.e. the number of holders of the
+	 * current_mode rwlock */
 	unsigned refcnt;
-	starpu_access_mode current_mode;
+	enum starpu_access_mode current_mode;
 	/* protect meta data */
-	starpu_spinlock_t header_lock;
+	struct _starpu_spinlock header_lock;
+
+	/* Condition to make application wait for all transfers before freeing handle */
+	/* busy_count is the number of handle->refcnt, handle->per_node[*]->refcnt, and number of starpu_data_requesters */
+	/* Core code which releases busy_count has to call
+	 * _starpu_data_check_not_busy to let starpu_data_unregister proceed */
+	unsigned busy_count;
+	/* Is starpu_data_unregister waiting for busy_count? */
+	unsigned busy_waiting;
+	pthread_mutex_t busy_mutex;
+	pthread_cond_t busy_cond;
 
 	/* In case we user filters, the handle may describe a sub-data */
-	struct starpu_data_state_t *root_handle; /* root of the tree */
-	struct starpu_data_state_t *father_handle; /* father of the node, NULL if the current node is the root */
+	struct _starpu_data_state *root_handle; /* root of the tree */
+	struct _starpu_data_state *father_handle; /* father of the node, NULL if the current node is the root */
 	unsigned sibling_index; /* indicate which child this node is from the father's perpsective (if any) */
 	unsigned depth; /* what's the depth of the tree ? */
 
-	struct starpu_data_state_t *children;
+	struct _starpu_data_state *children;
 	unsigned nchildren;
 
 	/* describe the state of the data in term of coherency */
-	struct starpu_data_replicate_s per_node[STARPU_MAXNODES];
-	struct starpu_data_replicate_s per_worker[STARPU_NMAXWORKERS];
+	struct _starpu_data_replicate per_node[STARPU_MAXNODES];
+	struct _starpu_data_replicate per_worker[STARPU_NMAXWORKERS];
 
-	struct starpu_data_interface_ops_t *ops;
+	struct starpu_data_interface_ops *ops;
 
 	/* To avoid recomputing data size all the time, we store it directly. */
 	size_t data_size;
@@ -140,14 +159,14 @@ struct starpu_data_state_t {
 	/* This lock should protect any operation to enforce
 	 * sequential_consistency */
 	pthread_mutex_t sequential_consistency_mutex;
-	
+
 	/* The last submitted task (or application data request) that declared
 	 * it would modify the piece of data ? Any task accessing the data in a
 	 * read-only mode should depend on that task implicitely if the
 	 * sequential_consistency flag is enabled. */
-	starpu_access_mode last_submitted_mode;
+	enum starpu_access_mode last_submitted_mode;
 	struct starpu_task *last_submitted_writer;
-	struct starpu_task_wrapper_list *last_submitted_readers;
+	struct _starpu_task_wrapper_list *last_submitted_readers;
 
 	/* If FxT is enabled, we keep track of "ghost dependencies": that is to
 	 * say the dependencies that are not needed anymore, but that should
@@ -157,9 +176,9 @@ struct starpu_data_state_t {
 	 * enforce this dependency anymore.*/
 	unsigned last_submitted_ghost_writer_id_is_valid;
 	unsigned long last_submitted_ghost_writer_id;
-	struct starpu_jobid_list *last_submitted_ghost_readers_id;
-	
-	struct starpu_task_wrapper_list *post_sync_tasks;
+	struct _starpu_jobid_list *last_submitted_ghost_readers_id;
+
+	struct _starpu_task_wrapper_list *post_sync_tasks;
 	unsigned post_sync_tasks_cnt;
 
 	/*
@@ -170,8 +189,8 @@ struct starpu_data_state_t {
 	 * the reduction of an interface into another one (eg. "+="), and init_func
 	 * initializes the data interface to a default value that is stable by
 	 * reduction (eg. 0 for +=). */
-	struct starpu_codelet_t *redux_cl;
-	struct starpu_codelet_t *init_cl;
+	struct starpu_codelet *redux_cl;
+	struct starpu_codelet *init_cl;
 
 	/* Are we currently performing a reduction on that handle ? If so the
 	 * reduction_refcnt should be non null until there are pending tasks
@@ -181,54 +200,77 @@ struct starpu_data_state_t {
 	/* List of requesters that are specific to the pending reduction. This
 	 * list is used when the requests in the req_list list are frozen until
 	 * the end of the reduction. */
-	struct starpu_data_requester_list_s *reduction_req_list;
+	struct _starpu_data_requester_list *reduction_req_list;
 
-	starpu_data_handle reduction_tmp_handles[STARPU_NMAXWORKERS];
+	starpu_data_handle_t reduction_tmp_handles[STARPU_NMAXWORKERS];
 
 	unsigned lazy_unregister;
 
         /* Used for MPI */
         int rank;
 	int tag;
+
+#ifdef STARPU_MEMORY_STATUS
+	/* Handle access stats per node */
+	unsigned stats_direct_access[STARPU_MAXNODES];
+	unsigned stats_loaded_shared[STARPU_MAXNODES];
+	unsigned stats_loaded_owner[STARPU_MAXNODES];
+	unsigned stats_shared_to_owner[STARPU_MAXNODES];
+	unsigned stats_invalidated[STARPU_MAXNODES];
+#endif
+
+	unsigned int mf_node; //XXX
 };
 
 void _starpu_display_msi_stats(void);
 
-int _starpu_fetch_data_on_node(struct starpu_data_state_t *state, struct starpu_data_replicate_s *replicate,
-				starpu_access_mode mode, unsigned is_prefetch,
-				void (*callback_func)(void *), void *callback_arg);
-void _starpu_release_data_on_node(struct starpu_data_state_t *state, uint32_t default_wt_mask,
-				struct starpu_data_replicate_s *replicate);
+/* This does not take a reference on the handle, the caller has to do it,
+ * e.g. through _starpu_attempt_to_submit_data_request_from_apps()
+ * detached means that the core is allowed to drop the request. The caller
+ * should thus *not* take a reference since it can not know whether the request will complete
+ * async means that _starpu_fetch_data_on_node will wait for completion of the request
+ */
+int _starpu_fetch_data_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate,
+			       enum starpu_access_mode mode, unsigned detached, unsigned async,
+			       void (*callback_func)(void *), void *callback_arg);
+/* This releases a reference on the handle */
+void _starpu_release_data_on_node(struct _starpu_data_state *state, uint32_t default_wt_mask,
+				  struct _starpu_data_replicate *replicate);
 
-void _starpu_update_data_state(starpu_data_handle handle,
-				struct starpu_data_replicate_s *requesting_replicate,
-				starpu_access_mode mode);
+void _starpu_update_data_state(starpu_data_handle_t handle,
+			       struct _starpu_data_replicate *requesting_replicate,
+			       enum starpu_access_mode mode);
 
-uint32_t _starpu_get_data_refcnt(struct starpu_data_state_t *state, uint32_t node);
+uint32_t _starpu_get_data_refcnt(struct _starpu_data_state *state, uint32_t node);
 
-size_t _starpu_data_get_size(starpu_data_handle handle);
+size_t _starpu_data_get_size(starpu_data_handle_t handle);
 
-uint32_t _starpu_data_get_footprint(starpu_data_handle handle);
+uint32_t _starpu_data_get_footprint(starpu_data_handle_t handle);
 
-void _starpu_push_task_output(struct starpu_task *task, uint32_t mask);
+void _starpu_push_task_output(struct _starpu_job *j, uint32_t mask);
 
 __attribute__((warn_unused_result))
-int _starpu_fetch_task_input(struct starpu_task *task, uint32_t mask);
+int _starpu_fetch_task_input(struct _starpu_job *j, uint32_t mask);
 
-unsigned _starpu_is_data_present_or_requested(struct starpu_data_state_t *state, uint32_t node);
-unsigned starpu_data_test_if_allocated_on_node(starpu_data_handle handle, uint32_t memory_node);
+unsigned _starpu_is_data_present_or_requested(struct _starpu_data_state *state, uint32_t node);
+unsigned starpu_data_test_if_allocated_on_node(starpu_data_handle_t handle, uint32_t memory_node);
 
 
-uint32_t _starpu_select_src_node(struct starpu_data_state_t *state);
+uint32_t _starpu_select_src_node(struct _starpu_data_state *state, unsigned destination);
 
-starpu_data_request_t create_request_to_fetch_data(starpu_data_handle handle,
-				struct starpu_data_replicate_s *dst_replicate,
-                                starpu_access_mode mode, unsigned is_prefetch,
-                                void (*callback_func)(void *), void *callback_arg);
-
-void _starpu_redux_init_data_replicate(starpu_data_handle handle, struct starpu_data_replicate_s *replicate, int workerid);
-void starpu_data_start_reduction_mode(starpu_data_handle handle);
-void starpu_data_end_reduction_mode(starpu_data_handle handle);
-void starpu_data_end_reduction_mode_terminate(starpu_data_handle handle);
+/* is_prefetch is whether the DSM may drop the request (when there is not enough memory for instance
+ * async is whether the caller wants a reference on the last request, to be
+ * able to wait for it (which will release that reference).
+ */
+struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
+								  struct _starpu_data_replicate *dst_replicate,
+								  enum starpu_access_mode mode, unsigned is_prefetch,
+								  unsigned async,
+								  void (*callback_func)(void *), void *callback_arg);
+
+void _starpu_redux_init_data_replicate(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, int workerid);
+void _starpu_data_start_reduction_mode(starpu_data_handle_t handle);
+void _starpu_data_end_reduction_mode(starpu_data_handle_t handle);
+void _starpu_data_end_reduction_mode_terminate(starpu_data_handle_t handle);
 
 #endif // __COHERENCY__H__
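The busy_count, busy_waiting, busy_mutex and busy_cond fields added to _starpu_data_state let starpu_data_unregister wait until every request and replicate holding a reference has been released. Here is a minimal sketch of that protocol, assuming hypothetical release_ref() and wait_not_busy() helpers in place of the real _starpu_data_check_not_busy path.

/* Sketch of the busy_count protocol described above: the unregister path
 * blocks until all outstanding references are dropped, while reference
 * holders signal when they release one.  Names are illustrative only. */
#include <pthread.h>
#include <stdio.h>

struct handle
{
	unsigned busy_count;	/* outstanding refs (requests, replicates, ...) */
	unsigned busy_waiting;	/* is someone blocked in wait_not_busy()? */
	pthread_mutex_t busy_mutex;
	pthread_cond_t busy_cond;
};

/* Called by whoever drops a reference. */
static void release_ref(struct handle *h)
{
	pthread_mutex_lock(&h->busy_mutex);
	if (--h->busy_count == 0 && h->busy_waiting)
		pthread_cond_broadcast(&h->busy_cond);
	pthread_mutex_unlock(&h->busy_mutex);
}

/* Called by the unregister path before freeing the handle. */
static void wait_not_busy(struct handle *h)
{
	pthread_mutex_lock(&h->busy_mutex);
	h->busy_waiting = 1;
	while (h->busy_count > 0)
		pthread_cond_wait(&h->busy_cond, &h->busy_mutex);
	pthread_mutex_unlock(&h->busy_mutex);
}

int main(void)
{
	struct handle h = { .busy_count = 1, .busy_waiting = 0,
			    .busy_mutex = PTHREAD_MUTEX_INITIALIZER,
			    .busy_cond = PTHREAD_COND_INITIALIZER };
	release_ref(&h);	/* last reference dropped... */
	wait_not_busy(&h);	/* ...so unregister can proceed immediately */
	printf("handle no longer busy\n");
	return 0;
}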

src/datawizard/copy_driver.c (+123 -97)

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -32,23 +32,23 @@ void _starpu_wake_all_blocked_workers_on_node(unsigned nodeid)
 	/* wake up all workers on that memory node */
 	unsigned cond_id;
 
-	starpu_mem_node_descr * const descr = _starpu_get_memory_node_description();
+	struct _starpu_mem_node_descr * const descr = _starpu_get_memory_node_description();
 
-	PTHREAD_RWLOCK_RDLOCK(&descr->conditions_rwlock);
+	_STARPU_PTHREAD_RWLOCK_RDLOCK(&descr->conditions_rwlock);
 
 	unsigned nconds = descr->condition_count[nodeid];
 	for (cond_id = 0; cond_id < nconds; cond_id++)
 	{
-		struct _cond_and_mutex *condition;
+		struct _starpu_cond_and_mutex *condition;
 		condition  = &descr->conditions_attached_to_node[nodeid][cond_id];
 
 		/* wake anybody waiting on that condition */
-		PTHREAD_MUTEX_LOCK(condition->mutex);
-		PTHREAD_COND_BROADCAST(condition->cond);
-		PTHREAD_MUTEX_UNLOCK(condition->mutex);
+		_STARPU_PTHREAD_MUTEX_LOCK(condition->mutex);
+		_STARPU_PTHREAD_COND_BROADCAST(condition->cond);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(condition->mutex);
 	}
 
-	PTHREAD_RWLOCK_UNLOCK(&descr->conditions_rwlock);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&descr->conditions_rwlock);
 }
 
 void starpu_wake_all_blocked_workers(void)
@@ -56,23 +56,23 @@ void starpu_wake_all_blocked_workers(void)
 	/* workers may be blocked on the various queues' conditions */
 	unsigned cond_id;
 
-	starpu_mem_node_descr * const descr = _starpu_get_memory_node_description();
+	struct _starpu_mem_node_descr * const descr = _starpu_get_memory_node_description();
 
-	PTHREAD_RWLOCK_RDLOCK(&descr->conditions_rwlock);
+	_STARPU_PTHREAD_RWLOCK_RDLOCK(&descr->conditions_rwlock);
 
 	unsigned nconds = descr->total_condition_count;
 	for (cond_id = 0; cond_id < nconds; cond_id++)
 	{
-		struct _cond_and_mutex *condition;
+		struct _starpu_cond_and_mutex *condition;
 		condition  = &descr->conditions_all[cond_id];
 
 		/* wake anybody waiting on that condition */
-		PTHREAD_MUTEX_LOCK(condition->mutex);
-		PTHREAD_COND_BROADCAST(condition->cond);
-		PTHREAD_MUTEX_UNLOCK(condition->mutex);
+		_STARPU_PTHREAD_MUTEX_LOCK(condition->mutex);
+		_STARPU_PTHREAD_COND_BROADCAST(condition->cond);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(condition->mutex);
 	}
 
-	PTHREAD_RWLOCK_UNLOCK(&descr->conditions_rwlock);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&descr->conditions_rwlock);
 }
 
 #ifdef STARPU_USE_FXT
@@ -82,7 +82,10 @@ void starpu_wake_all_blocked_workers(void)
 static unsigned communication_cnt = 0;
 #endif
 
-static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_data_replicate_s *src_replicate, struct starpu_data_replicate_s *dst_replicate, struct starpu_data_request_s *req STARPU_ATTRIBUTE_UNUSED)
+static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
+				    struct _starpu_data_replicate *src_replicate,
+				    struct _starpu_data_replicate *dst_replicate,
+				    struct _starpu_data_request *req STARPU_ATTRIBUTE_UNUSED)
 {
 	int ret = 0;
 
@@ -91,8 +94,8 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 	unsigned src_node = src_replicate->memory_node;
 	unsigned dst_node = dst_replicate->memory_node;
 
-	starpu_node_kind src_kind = _starpu_get_node_kind(src_node);
-	starpu_node_kind dst_kind = _starpu_get_node_kind(dst_node);
+	enum starpu_node_kind src_kind = starpu_node_get_kind(src_node);
+	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);
 
 	STARPU_ASSERT(src_replicate->refcnt);
 	STARPU_ASSERT(dst_replicate->refcnt);
@@ -105,6 +108,8 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 	cudaStream_t stream;
 #endif
 
+	_starpu_comm_amounts_inc(src_node, dst_node, handle->ops->get_size(handle));
+
 	void *src_interface = src_replicate->data_interface;
 	void *dst_interface = dst_replicate->data_interface;
 
@@ -112,12 +117,13 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 	if ((src_kind == STARPU_CUDA_RAM) || (dst_kind == STARPU_CUDA_RAM))
 	{
 		int node = (dst_kind == STARPU_CUDA_RAM)?dst_node:src_node;
-		cures = cudaSetDevice(starpu_memory_node_to_devid(node));
+		cures = cudaSetDevice(_starpu_memory_node_to_devid(node));
 		STARPU_ASSERT(cures == cudaSuccess);
 	}
 #endif
 
-	switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind)) {
+	switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind))
+	{
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
 		/* STARPU_CPU_RAM -> STARPU_CPU_RAM */
 		STARPU_ASSERT(copy_methods->ram_to_ram);
@@ -126,12 +132,17 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 #ifdef STARPU_USE_CUDA
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
 		/* only the proper CUBLAS thread can initiate this directly ! */
+#if !defined(HAVE_CUDA_MEMCPY_PEER)
+		STARPU_ASSERT(_starpu_get_local_memory_node() == src_node);
+#endif
 		STARPU_ASSERT(copy_methods->cuda_to_ram);
-		if (!req || !copy_methods->cuda_to_ram_async) {
+		if (!req || !copy_methods->cuda_to_ram_async)
+		{
 			/* this is not associated to a request so it's synchronous */
 			copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
 		}
-		else {
+		else
+		{
 			req->async_channel.type = STARPU_CUDA_RAM;
 			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
@@ -146,19 +157,23 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
 		/* STARPU_CPU_RAM -> CUBLAS_RAM */
 		/* only the proper CUBLAS thread can initiate this ! */
+#if !defined(HAVE_CUDA_MEMCPY_PEER)
 		STARPU_ASSERT(_starpu_get_local_memory_node() == dst_node);
+#endif
 		STARPU_ASSERT(copy_methods->ram_to_cuda);
-		if (!req || !copy_methods->ram_to_cuda_async) {
+		if (!req || !copy_methods->ram_to_cuda_async)
+		{
 			/* this is not associated to a request so it's synchronous */
 			copy_methods->ram_to_cuda(src_interface, src_node, dst_interface, dst_node);
 		}
-		else {
+		else
+		{
 			req->async_channel.type = STARPU_CUDA_RAM;
 			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
 			if (STARPU_UNLIKELY(cures != cudaSuccess))
 				STARPU_CUDA_REPORT_ERROR(cures);
 
-			stream = starpu_cuda_get_local_stream();
+			stream = starpu_cuda_get_local_transfer_stream();
 			ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
 
 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
@@ -169,16 +184,19 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
 		/* CUDA - CUDA transfer */
 		STARPU_ASSERT(copy_methods->cuda_to_cuda || copy_methods->cuda_to_cuda_async);
-		if (!req || !copy_methods->cuda_to_cuda_async) {
+		if (!req || !copy_methods->cuda_to_cuda_async)
+		{
+			STARPU_ASSERT(copy_methods->cuda_to_cuda);
 			/* this is not associated to a request so it's synchronous */
 			copy_methods->cuda_to_cuda(src_interface, src_node, dst_interface, dst_node);
 		}
-		else {
+		else
+		{
 			req->async_channel.type = STARPU_CUDA_RAM;
 			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 
-			stream = starpu_cuda_get_local_stream();
+			stream = starpu_cuda_get_local_transfer_stream();
 			ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
 
 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
@@ -189,18 +207,22 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 #ifdef STARPU_USE_OPENCL
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
 		/* OpenCL -> RAM */
-		if (_starpu_get_local_memory_node() == src_node) {
+		if (_starpu_get_local_memory_node() == src_node)
+		{
 			STARPU_ASSERT(copy_methods->opencl_to_ram);
-			if (!req || !copy_methods->opencl_to_ram_async) {
+			if (!req || !copy_methods->opencl_to_ram_async)
+			{
 				/* this is not associated to a request so it's synchronous */
 				copy_methods->opencl_to_ram(src_interface, src_node, dst_interface, dst_node);
 			}
-			else {
+			else
+			{
 				req->async_channel.type = STARPU_OPENCL_RAM;
 				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
 			}
 		}
-		else {
+		else
+		{
 			/* we should not have a blocking call ! */
 			STARPU_ABORT();
 		}
@@ -209,11 +231,13 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 		/* STARPU_CPU_RAM -> STARPU_OPENCL_RAM */
 		STARPU_ASSERT(_starpu_get_local_memory_node() == dst_node);
 		STARPU_ASSERT(copy_methods->ram_to_opencl);
-		if (!req || !copy_methods->ram_to_opencl_async) {
+		if (!req || !copy_methods->ram_to_opencl_async)
+		{
 			/* this is not associated to a request so it's synchronous */
 			copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
 		}
-		else {
+		else
+		{
 			req->async_channel.type = STARPU_OPENCL_RAM;
 			ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
 		}
@@ -227,12 +251,12 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 	return ret;
 }
 
-int __attribute__((warn_unused_result)) _starpu_driver_copy_data_1_to_1(starpu_data_handle handle,
-						struct starpu_data_replicate_s *src_replicate,
-						struct starpu_data_replicate_s *dst_replicate,
-						unsigned donotread,
-						struct starpu_data_request_s *req,
-						unsigned may_alloc)
+int __attribute__((warn_unused_result)) _starpu_driver_copy_data_1_to_1(starpu_data_handle_t handle,
+									struct _starpu_data_replicate *src_replicate,
+									struct _starpu_data_replicate *dst_replicate,
+									unsigned donotread,
+									struct _starpu_data_request *req,
+									unsigned may_alloc)
 {
 	if (!donotread)
 	{
@@ -252,7 +276,7 @@ int __attribute__((warn_unused_result)) _starpu_driver_copy_data_1_to_1(starpu_d
 		if (!may_alloc)
 			return -ENOMEM;
 
-		ret_alloc = _starpu_allocate_memory_on_node(handle, dst_replicate);
+		ret_alloc = _starpu_allocate_memory_on_node(handle, dst_replicate,req->prefetch);
 		if (ret_alloc)
 			return -ENOMEM;
 	}
@@ -260,12 +284,13 @@ int __attribute__((warn_unused_result)) _starpu_driver_copy_data_1_to_1(starpu_d
 	STARPU_ASSERT(dst_replicate->allocated);
 	STARPU_ASSERT(dst_replicate->refcnt);
 
-	/* if there is no need to actually read the data, 
+	/* if there is no need to actually read the data,
 	 * we do not perform any transfer */
-	if (!donotread) {
+	if (!donotread)
+	{
 		size_t size = _starpu_data_get_size(handle);
 		_starpu_bus_update_profiling_info((int)src_node, (int)dst_node, size);
-		
+
 #ifdef STARPU_USE_FXT
 		com_id = STARPU_ATOMIC_ADD(&communication_cnt, 1);
 
@@ -273,13 +298,13 @@ int __attribute__((warn_unused_result)) _starpu_driver_copy_data_1_to_1(starpu_d
 			req->com_id = com_id;
 #endif
 
-		STARPU_TRACE_START_DRIVER_COPY(src_node, dst_node, size, com_id);
+		_STARPU_TRACE_START_DRIVER_COPY(src_node, dst_node, size, com_id);
 		ret_copy = copy_data_1_to_1_generic(handle, src_replicate, dst_replicate, req);
 
 #ifdef STARPU_USE_FXT
 		if (ret_copy != -EAGAIN)
 		{
-			STARPU_TRACE_END_DRIVER_COPY(src_node, dst_node, size, com_id);
+			_STARPU_TRACE_END_DRIVER_COPY(src_node, dst_node, size, com_id);
 		}
 #endif
 
@@ -289,83 +314,84 @@ int __attribute__((warn_unused_result)) _starpu_driver_copy_data_1_to_1(starpu_d
 	return 0;
 }
 
-void _starpu_driver_wait_request_completion(struct starpu_async_channel *async_channel)
+void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel)
 {
-	starpu_node_kind kind = async_channel->type;
+	enum starpu_node_kind kind = async_channel->type;
 #ifdef STARPU_USE_CUDA
 	cudaEvent_t event;
 	cudaError_t cures;
 #endif
 
-	switch (kind) {
+	switch (kind)
+	{
 #ifdef STARPU_USE_CUDA
-		case STARPU_CUDA_RAM:
-			event = (*async_channel).event.cuda_event;
+	case STARPU_CUDA_RAM:
+		event = (*async_channel).event.cuda_event;
 
-			cures = cudaEventSynchronize(event);
-			if (STARPU_UNLIKELY(cures))
-				STARPU_CUDA_REPORT_ERROR(cures);
+		cures = cudaEventSynchronize(event);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
 
-			cures = cudaEventDestroy(event);
-			if (STARPU_UNLIKELY(cures))
-				STARPU_CUDA_REPORT_ERROR(cures);
+		cures = cudaEventDestroy(event);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
 
-			break;
+		break;
 #endif
 #ifdef STARPU_USE_OPENCL
-      case STARPU_OPENCL_RAM:
-         {
-                 if ((*async_channel).event.opencl_event == NULL) STARPU_ABORT();
-                 cl_int err = clWaitForEvents(1, &((*async_channel).event.opencl_event));
-                 if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-                 clReleaseEvent((*async_channel).event.opencl_event);
-         }
-         break;
+	case STARPU_OPENCL_RAM:
+	{
+		if ((*async_channel).event.opencl_event == NULL) STARPU_ABORT();
+		cl_int err = clWaitForEvents(1, &((*async_channel).event.opencl_event));
+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+		clReleaseEvent((*async_channel).event.opencl_event);
+		break;
+	}
 #endif
-		case STARPU_CPU_RAM:
-		default:
-			STARPU_ABORT();
+	case STARPU_CPU_RAM:
+	default:
+		STARPU_ABORT();
 	}
 }
 
-unsigned _starpu_driver_test_request_completion(struct starpu_async_channel *async_channel)
+unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *async_channel)
 {
-	starpu_node_kind kind = async_channel->type;
+	enum starpu_node_kind kind = async_channel->type;
 	unsigned success;
 #ifdef STARPU_USE_CUDA
 	cudaEvent_t event;
 #endif
 
-	switch (kind) {
+	switch (kind)
+	{
 #ifdef STARPU_USE_CUDA
-		case STARPU_CUDA_RAM:
-			event = (*async_channel).event.cuda_event;
-			CUresult cures = cudaEventQuery(event);
-
-			success = (cures == cudaSuccess);
-			if (success)
-				cudaEventDestroy(event);
-			else if (cures != cudaErrorNotReady)
-				STARPU_CUDA_REPORT_ERROR(cures);
-
-			break;
+	case STARPU_CUDA_RAM:
+		event = (*async_channel).event.cuda_event;
+		cudaError_t cures = cudaEventQuery(event);
+
+		success = (cures == cudaSuccess);
+		if (success)
+			cudaEventDestroy(event);
+		else if (cures != cudaErrorNotReady)
+			STARPU_CUDA_REPORT_ERROR(cures);
+		break;
 #endif
 #ifdef STARPU_USE_OPENCL
-      case STARPU_OPENCL_RAM:
-         {
-            cl_int event_status;
-            cl_event opencl_event = (*async_channel).event.opencl_event;
-            if (opencl_event == NULL) STARPU_ABORT();
-            cl_int err = clGetEventInfo(opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
-            if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-            success = (event_status == CL_COMPLETE);
-            break;
-         }
+	case STARPU_OPENCL_RAM:
+	{
+		cl_int event_status;
+		cl_event opencl_event = (*async_channel).event.opencl_event;
+		if (opencl_event == NULL) STARPU_ABORT();
+		cl_int err = clGetEventInfo(opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+		success = (event_status == CL_COMPLETE);
+		break;
+	}
 #endif
-		case STARPU_CPU_RAM:
-		default:
-			STARPU_ABORT();
-			success = 0;
+	case STARPU_CPU_RAM:
+	default:
+		STARPU_ABORT();
+		success = 0;
 	}
 
 	return success;
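
The two functions rewritten above follow the same tagged-union pattern: an asynchronous channel carries an enum starpu_node_kind discriminant plus a backend-specific event, and completion is either awaited (cudaEventSynchronize / clWaitForEvents) or polled (cudaEventQuery / clGetEventInfo). The standalone sketch below restates the polling side of that pattern with a fake event type standing in for cudaEvent_t and cl_event; it is an illustration only, not StarPU code.

#include <stdio.h>

/* Stand-in for cudaEvent_t / cl_event: a real driver queries the backend
 * (cudaEventQuery(), clGetEventInfo()) instead of reading a flag. */
struct fake_event { int done; };

enum channel_kind { KIND_CUDA_RAM, KIND_OPENCL_RAM };

union channel_event
{
	struct fake_event *cuda_event;
	struct fake_event *opencl_event;
};

struct async_channel
{
	enum channel_kind type;
	union channel_event event;
};

/* Non-blocking completion test: returns 1 once the transfer has landed. */
static unsigned channel_test_completion(struct async_channel *c)
{
	switch (c->type)
	{
	case KIND_CUDA_RAM:
		return c->event.cuda_event->done;
	case KIND_OPENCL_RAM:
		return c->event.opencl_event->done;
	default:
		return 0;
	}
}

int main(void)
{
	struct fake_event ev = { 0 };
	struct async_channel chan = { .type = KIND_CUDA_RAM, .event.cuda_event = &ev };

	printf("completed? %u\n", channel_test_completion(&chan));
	ev.done = 1;	/* the backend would flip this when the copy finishes */
	printf("completed? %u\n", channel_test_completion(&chan));
	return 0;
}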

+ 18 - 16
src/datawizard/copy_driver.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -33,12 +33,13 @@
 #include <starpu_opencl.h>
 #endif
 
-struct starpu_data_request_s;
-struct starpu_data_replicate_s;
+struct _starpu_data_request;
+struct _starpu_data_replicate;
 
 /* this is a structure that can be queried to see whether an asynchronous
  * transfer has terminated or not */
-typedef union {
+union _starpu_async_channel_event
+{
 	int dummy;
 #ifdef STARPU_USE_CUDA
 	cudaEvent_t cuda_event;
@@ -46,22 +47,23 @@ typedef union {
 #ifdef STARPU_USE_OPENCL
         cl_event opencl_event;
 #endif
-} starpu_async_channel_event;
+};
 
-struct starpu_async_channel {
-	starpu_async_channel_event event;
-	starpu_node_kind type;
+struct _starpu_async_channel
+{
+	union _starpu_async_channel_event event;
+	enum starpu_node_kind type;
 };
 
 void _starpu_wake_all_blocked_workers_on_node(unsigned nodeid);
 
-int _starpu_driver_copy_data_1_to_1(starpu_data_handle handle,
-					struct starpu_data_replicate_s *src_replicate,
-					struct starpu_data_replicate_s *dst_replicate,
-					unsigned donotread,
-					struct starpu_data_request_s *req,
-					unsigned may_alloc);
+int _starpu_driver_copy_data_1_to_1(starpu_data_handle_t handle,
+				    struct _starpu_data_replicate *src_replicate,
+				    struct _starpu_data_replicate *dst_replicate,
+				    unsigned donotread,
+				    struct _starpu_data_request *req,
+				    unsigned may_alloc);
 
-unsigned _starpu_driver_test_request_completion(struct starpu_async_channel *async_channel);
-void _starpu_driver_wait_request_completion(struct starpu_async_channel *async_channel);
+unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *async_channel);
+void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel);
 #endif // __COPY_DRIVER_H__

+ 256 - 98
src/datawizard/data_request.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,27 +20,29 @@
 #include <datawizard/datawizard.h>
 
 /* requests that have not been treated at all */
-static starpu_data_request_list_t data_requests[STARPU_MAXNODES];
-static pthread_cond_t data_requests_list_cond[STARPU_MAXNODES];
+static struct _starpu_data_request_list *data_requests[STARPU_MAXNODES];
+static struct _starpu_data_request_list *prefetch_requests[STARPU_MAXNODES];
 static pthread_mutex_t data_requests_list_mutex[STARPU_MAXNODES];
 
 /* requests that are not terminated (eg. async transfers) */
-static starpu_data_request_list_t data_requests_pending[STARPU_MAXNODES];
-static pthread_cond_t data_requests_pending_list_cond[STARPU_MAXNODES];
+static struct _starpu_data_request_list *data_requests_pending[STARPU_MAXNODES];
 static pthread_mutex_t data_requests_pending_list_mutex[STARPU_MAXNODES];
 
+int starpu_memstrategy_drop_prefetch[STARPU_MAXNODES];
+
 void _starpu_init_data_request_lists(void)
 {
 	unsigned i;
 	for (i = 0; i < STARPU_MAXNODES; i++)
 	{
-		data_requests[i] = starpu_data_request_list_new();
-		PTHREAD_MUTEX_INIT(&data_requests_list_mutex[i], NULL);
-		PTHREAD_COND_INIT(&data_requests_list_cond[i], NULL);
+		prefetch_requests[i] = _starpu_data_request_list_new();
+		data_requests[i] = _starpu_data_request_list_new();
+		_STARPU_PTHREAD_MUTEX_INIT(&data_requests_list_mutex[i], NULL);
+
+		data_requests_pending[i] = _starpu_data_request_list_new();
+		_STARPU_PTHREAD_MUTEX_INIT(&data_requests_pending_list_mutex[i], NULL);
 
-		data_requests_pending[i] = starpu_data_request_list_new();
-		PTHREAD_MUTEX_INIT(&data_requests_pending_list_mutex[i], NULL);
-		PTHREAD_COND_INIT(&data_requests_pending_list_cond[i], NULL);
+		starpu_memstrategy_drop_prefetch[i]=0;
 	}
 }
 
@@ -49,18 +51,17 @@ void _starpu_deinit_data_request_lists(void)
 	unsigned i;
 	for (i = 0; i < STARPU_MAXNODES; i++)
 	{
-		PTHREAD_COND_DESTROY(&data_requests_pending_list_cond[i]);
-		PTHREAD_MUTEX_DESTROY(&data_requests_pending_list_mutex[i]);
-		starpu_data_request_list_delete(data_requests_pending[i]);
+		_STARPU_PTHREAD_MUTEX_DESTROY(&data_requests_pending_list_mutex[i]);
+		_starpu_data_request_list_delete(data_requests_pending[i]);
 
-		PTHREAD_COND_DESTROY(&data_requests_list_cond[i]);
-		PTHREAD_MUTEX_DESTROY(&data_requests_list_mutex[i]);
-		starpu_data_request_list_delete(data_requests[i]);
+		_STARPU_PTHREAD_MUTEX_DESTROY(&data_requests_list_mutex[i]);
+		_starpu_data_request_list_delete(data_requests[i]);
+		_starpu_data_request_list_delete(prefetch_requests[i]);
 	}
 }
 
 /* this should be called with the lock r->handle->header_lock taken */
-static void starpu_data_request_destroy(starpu_data_request_t r)
+static void starpu_data_request_destroy(struct _starpu_data_request *r)
 {
 	unsigned node;
 
@@ -71,25 +72,27 @@ static void starpu_data_request_destroy(starpu_data_request_t r)
 	{
 		node = r->src_replicate->memory_node;
 	}
-	else {
+	else
+	{
 		node = r->dst_replicate->memory_node;
 	}
 
 	STARPU_ASSERT(r->dst_replicate->request[node] == r);
 	r->dst_replicate->request[node] = NULL;
 	//fprintf(stderr, "DESTROY REQ %p (%d) refcnt %d\n", r, node, r->refcnt);
-	starpu_data_request_delete(r);
+	_starpu_data_request_delete(r);
 }
 
 /* handle->lock should already be taken !  */
-starpu_data_request_t _starpu_create_data_request(starpu_data_handle handle,
-				struct starpu_data_replicate_s *src_replicate,
-				struct starpu_data_replicate_s *dst_replicate,
-				uint32_t handling_node,
-				starpu_access_mode mode,
-				unsigned ndeps)
+struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t handle,
+							 struct _starpu_data_replicate *src_replicate,
+							 struct _starpu_data_replicate *dst_replicate,
+							 uint32_t handling_node,
+							 enum starpu_access_mode mode,
+							 unsigned ndeps,
+							 unsigned is_prefetch)
 {
-	starpu_data_request_t r = starpu_data_request_new();
+	struct _starpu_data_request *r = _starpu_data_request_new();
 
 	_starpu_spin_init(&r->lock);
 
@@ -99,6 +102,7 @@ starpu_data_request_t _starpu_create_data_request(starpu_data_handle handle,
 	r->mode = mode;
 	r->handling_node = handling_node;
 	r->completed = 0;
+	r->prefetch = is_prefetch;
 	r->retval = -1;
 	r->ndeps = ndeps;
 	r->next_req_count = 0;
@@ -106,15 +110,20 @@ starpu_data_request_t _starpu_create_data_request(starpu_data_handle handle,
 
 	_starpu_spin_lock(&r->lock);
 
+	/* Take a reference on the target for the request to be able to write it */
 	dst_replicate->refcnt++;
+	handle->busy_count++;
 
 	if (mode & STARPU_R)
 	{
 		unsigned src_node = src_replicate->memory_node;
 		dst_replicate->request[src_node] = r;
+		/* Take a reference on the source for the request to be able to read it */
 		src_replicate->refcnt++;
+		handle->busy_count++;
 	}
-	else {
+	else
+	{
 		unsigned dst_node = dst_replicate->memory_node;
 		dst_replicate->request[dst_node] = r;
 	}
@@ -126,14 +135,15 @@ starpu_data_request_t _starpu_create_data_request(starpu_data_handle handle,
 	return r;
 }
 
-int _starpu_wait_data_request_completion(starpu_data_request_t r, unsigned may_alloc)
+int _starpu_wait_data_request_completion(struct _starpu_data_request *r, unsigned may_alloc)
 {
 	int retval;
 	int do_delete = 0;
 
 	uint32_t local_node = _starpu_get_local_memory_node();
 
-	do {
+	do
+	{
 		_starpu_spin_lock(&r->lock);
 
 		if (r->completed)
@@ -147,13 +157,14 @@ int _starpu_wait_data_request_completion(starpu_data_request_t r, unsigned may_a
 
 		_starpu_datawizard_progress(local_node, may_alloc);
 
-	} while (1);
+	}
+	while (1);
 
 
 	retval = r->retval;
 	if (retval)
 		_STARPU_DISP("REQUEST %p COMPLETED (retval %d) !\n", r, r->retval);
-		
+
 
 	r->refcnt--;
 
@@ -162,15 +173,15 @@ int _starpu_wait_data_request_completion(starpu_data_request_t r, unsigned may_a
 		do_delete = 1;
 
 	_starpu_spin_unlock(&r->lock);
-	
+
 	if (do_delete)
 		starpu_data_request_destroy(r);
-	
+
 	return retval;
 }
 
 /* this is non blocking */
-void _starpu_post_data_request(starpu_data_request_t r, uint32_t handling_node)
+void _starpu_post_data_request(struct _starpu_data_request *r, uint32_t handling_node)
 {
 //	_STARPU_DEBUG("POST REQUEST\n");
 
@@ -185,9 +196,12 @@ void _starpu_post_data_request(starpu_data_request_t r, uint32_t handling_node)
 	}
 
 	/* insert the request in the proper list */
-	PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[handling_node]);
-	starpu_data_request_list_push_front(data_requests[handling_node], r);
-	PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[handling_node]);
+	_STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[handling_node]);
+	if (r->prefetch)
+		_starpu_data_request_list_push_back(prefetch_requests[handling_node], r);
+	else
+		_starpu_data_request_list_push_back(data_requests[handling_node], r);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[handling_node]);
 
 #ifndef STARPU_NON_BLOCKING_DRIVERS
 	_starpu_wake_all_blocked_workers_on_node(handling_node);
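
The hunk above routes prefetch requests to a dedicated prefetch_requests queue, so that demand fetches (data a ready task is actually waiting for) are never stuck behind speculative transfers. A toy dispatcher illustrating that priority between the two FIFOs might look like the sketch below; the queue type and request ids are invented for the example, with plain arrays standing in for the _starpu_data_request_list lists.

#include <stdio.h>

#define QLEN 8

/* Two FIFOs per memory node: demand fetches and prefetches. */
struct queue { int req[QLEN]; int head, tail; };

static void push(struct queue *q, int r) { q->req[q->tail++ % QLEN] = r; }
static int  empty(struct queue *q)       { return q->head == q->tail; }
static int  pop(struct queue *q)         { return q->req[q->head++ % QLEN]; }

/* Always serve demand fetches first; prefetches only fill idle time. */
static int next_request(struct queue *fetch, struct queue *prefetch)
{
	if (!empty(fetch))
		return pop(fetch);
	if (!empty(prefetch))
		return pop(prefetch);
	return -1;	/* nothing to do */
}

int main(void)
{
	struct queue fetch = { { 0 }, 0, 0 }, prefetch = { { 0 }, 0, 0 };

	push(&prefetch, 100);	/* speculative transfer */
	push(&fetch, 1);	/* a task is actually waiting on this one */

	int r;
	while ((r = next_request(&fetch, &prefetch)) != -1)
		printf("handling request %d\n", r);
	return 0;
}
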
@@ -195,13 +209,13 @@ void _starpu_post_data_request(starpu_data_request_t r, uint32_t handling_node)
 }
 
 /* We assume that r->lock is taken by the caller */
-void _starpu_data_request_append_callback(starpu_data_request_t r, void (*callback_func)(void *), void *callback_arg)
+void _starpu_data_request_append_callback(struct _starpu_data_request *r, void (*callback_func)(void *), void *callback_arg)
 {
 	STARPU_ASSERT(r);
 
 	if (callback_func)
 	{
-		struct callback_list *link = (struct callback_list *) malloc(sizeof(struct callback_list));
+		struct _starpu_callback_list *link = (struct _starpu_callback_list *) malloc(sizeof(struct _starpu_callback_list));
 		STARPU_ASSERT(link);
 
 		link->callback_func = callback_func;
@@ -212,22 +226,46 @@ void _starpu_data_request_append_callback(starpu_data_request_t r, void (*callba
 }
 
 /* This method is called with handle's header_lock taken */
-static void starpu_handle_data_request_completion(starpu_data_request_t r)
+static void starpu_handle_data_request_completion(struct _starpu_data_request *r)
 {
 	unsigned do_delete = 0;
-	starpu_data_handle handle = r->handle;
-	starpu_access_mode mode = r->mode;
+	starpu_data_handle_t handle = r->handle;
+	enum starpu_access_mode mode = r->mode;
 
-	struct starpu_data_replicate_s *src_replicate = r->src_replicate;
-	struct starpu_data_replicate_s *dst_replicate = r->dst_replicate;
+	struct _starpu_data_replicate *src_replicate = r->src_replicate;
+	struct _starpu_data_replicate *dst_replicate = r->dst_replicate;
 
+
+#ifdef STARPU_MEMORY_STATUS
+	enum _starpu_cache_state old_src_replicate_state = src_replicate->state;
+#endif
 	_starpu_update_data_state(handle, r->dst_replicate, mode);
 
+#ifdef STARPU_MEMORY_STATUS
+	if (src_replicate->state == STARPU_INVALID)
+	{
+		if (old_src_replicate_state == STARPU_OWNER)
+			_starpu_handle_stats_invalidated(handle, src_replicate->memory_node);
+		else
+		{
+			/* XXX Currently only ex-OWNER are tagged as invalidated */
+			/* XXX Have to check all old state of every node in case a SHARED data become OWNED by the dst_replicate */
+		}
+
+	}
+	if (dst_replicate->state == STARPU_SHARED)
+		_starpu_handle_stats_loaded_shared(handle, dst_replicate->memory_node);
+	else if (dst_replicate->state == STARPU_OWNER)
+	{
+		_starpu_handle_stats_loaded_owner(handle, dst_replicate->memory_node);
+	}
+#endif
+
 #ifdef STARPU_USE_FXT
 	uint32_t src_node = src_replicate->memory_node;
 	uint32_t dst_node = dst_replicate->memory_node;
 	size_t size = _starpu_data_get_size(handle);
-	STARPU_TRACE_END_DRIVER_COPY(src_node, dst_node, size, r->com_id);
+	_STARPU_TRACE_END_DRIVER_COPY(src_node, dst_node, size, r->com_id);
 #endif
 
 	/* Once the request has been fulfilled, we may submit the requests that
@@ -235,36 +273,42 @@ static void starpu_handle_data_request_completion(starpu_data_request_t r)
 	unsigned chained_req;
 	for (chained_req = 0; chained_req < r->next_req_count; chained_req++)
 	{
-		struct starpu_data_request_s *next_req = r->next_req[chained_req];
+		struct _starpu_data_request *next_req = r->next_req[chained_req];
 		STARPU_ASSERT(next_req->ndeps > 0);
 		next_req->ndeps--;
 		_starpu_post_data_request(next_req, next_req->handling_node);
 	}
 
 	r->completed = 1;
-	
-	/* Remove a reference on the destination replicate  */
+
+	/* Remove a reference on the destination replicate for the request */
 	STARPU_ASSERT(dst_replicate->refcnt > 0);
 	dst_replicate->refcnt--;
+	STARPU_ASSERT(handle->busy_count > 0);
+	handle->busy_count--;
 
 	/* In case the source was "locked" by the request too */
 	if (mode & STARPU_R)
 	{
 		STARPU_ASSERT(src_replicate->refcnt > 0);
 		src_replicate->refcnt--;
+		STARPU_ASSERT(handle->busy_count > 0);
+		handle->busy_count--;
 	}
 
+	_starpu_data_check_not_busy(handle);
+
 	r->refcnt--;
 
 	/* if nobody is waiting on that request, we can get rid of it */
 	if (r->refcnt == 0)
 		do_delete = 1;
-	
+
 	r->retval = 0;
 
 	/* In case there are one or multiple callbacks, we execute them now. */
-	struct callback_list *callbacks = r->callbacks;
-	
+	struct _starpu_callback_list *callbacks = r->callbacks;
+
 	_starpu_spin_unlock(&r->lock);
 
 	if (do_delete)
@@ -278,24 +322,24 @@ static void starpu_handle_data_request_completion(starpu_data_request_t r)
 	{
 		callbacks->callback_func(callbacks->callback_arg);
 
-		struct callback_list *next = callbacks->next;
+		struct _starpu_callback_list *next = callbacks->next;
 		free(callbacks);
 		callbacks = next;
 	}
 }
 
 /* TODO : accounting to see how much time was spent working for other people ... */
-static int starpu_handle_data_request(starpu_data_request_t r, unsigned may_alloc)
+static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned may_alloc)
 {
-	starpu_data_handle handle = r->handle;
+	starpu_data_handle_t handle = r->handle;
 
 	_starpu_spin_lock(&handle->header_lock);
 	_starpu_spin_lock(&r->lock);
 
-	struct starpu_data_replicate_s *src_replicate = r->src_replicate;
-	struct starpu_data_replicate_s *dst_replicate = r->dst_replicate;
+	struct _starpu_data_replicate *src_replicate = r->src_replicate;
+	struct _starpu_data_replicate *dst_replicate = r->dst_replicate;
 
-	starpu_access_mode r_mode = r->mode;
+	enum starpu_access_mode r_mode = r->mode;
 
 	STARPU_ASSERT(!(r_mode & STARPU_R) || src_replicate);
 	STARPU_ASSERT(!(r_mode & STARPU_R) || src_replicate->allocated);
@@ -307,7 +351,7 @@ static int starpu_handle_data_request(starpu_data_request_t r, unsigned may_allo
 	/* the header of the data must be locked by the worker that submitted the request */
 
 	r->retval = _starpu_driver_copy_data_1_to_1(handle, src_replicate,
-			dst_replicate, !(r_mode & STARPU_R), r, may_alloc);
+						    dst_replicate, !(r_mode & STARPU_R), r, may_alloc);
 
 	if (r->retval == -ENOMEM)
 	{
@@ -325,9 +369,9 @@ static int starpu_handle_data_request(starpu_data_request_t r, unsigned may_allo
 		 * requests in the meantime. */
 		_starpu_spin_unlock(&handle->header_lock);
 
-		PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[r->handling_node]);
-		starpu_data_request_list_push_front(data_requests_pending[r->handling_node], r);
-		PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[r->handling_node]);
+		_STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[r->handling_node]);
+		_starpu_data_request_list_push_front(data_requests_pending[r->handling_node], r);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[r->handling_node]);
 
 		return -EAGAIN;
 	}
@@ -341,17 +385,18 @@ static int starpu_handle_data_request(starpu_data_request_t r, unsigned may_allo
 
 void _starpu_handle_node_data_requests(uint32_t src_node, unsigned may_alloc)
 {
-	starpu_data_request_t r;
+	struct _starpu_data_request *r;
+	struct _starpu_data_request_list *new_data_requests;
 
 	/* take all the entries from the request list */
-        PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
+        _STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
 
-	starpu_data_request_list_t local_list = data_requests[src_node];
+	struct _starpu_data_request_list *local_list = data_requests[src_node];
 
-	if (starpu_data_request_list_empty(local_list))
+	if (_starpu_data_request_list_empty(local_list))
 	{
 		/* there is no request */
-                PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
+                _STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
 
 		return;
 	}
@@ -359,83 +404,161 @@ void _starpu_handle_node_data_requests(uint32_t src_node, unsigned may_alloc)
 	/* There is an entry: we create a new empty list to replace the list of
 	 * requests, and we handle the request(s) one by one in the former
 	 * list, without concurrency issues.*/
-	data_requests[src_node] = starpu_data_request_list_new();
+	data_requests[src_node] = _starpu_data_request_list_new();
 
-	PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
+
+	new_data_requests = _starpu_data_request_list_new();
 
 	/* for all entries of the list */
-	while (!starpu_data_request_list_empty(local_list))
+	while (!_starpu_data_request_list_empty(local_list))
 	{
                 int res;
 
-		r = starpu_data_request_list_pop_back(local_list);
+		r = _starpu_data_request_list_pop_front(local_list);
 
 		res = starpu_handle_data_request(r, may_alloc);
 		if (res == -ENOMEM)
 		{
-                        PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
-			starpu_data_request_list_push_front(data_requests[src_node], r);
-			PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
+			_starpu_data_request_list_push_back(new_data_requests, r);
 		}
+	}
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
+	_starpu_data_request_list_push_list_front(new_data_requests, data_requests[src_node]);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
+
+	_starpu_data_request_list_delete(new_data_requests);
+	_starpu_data_request_list_delete(local_list);
+}
+
+void _starpu_handle_node_prefetch_requests(uint32_t src_node, unsigned may_alloc)
+{
+	starpu_memstrategy_drop_prefetch[src_node]=0;
+
+	struct _starpu_data_request *r;
+	struct _starpu_data_request_list *new_data_requests;
+	struct _starpu_data_request_list *new_prefetch_requests;
 
-		/* wake the requesting worker up */
-		// if we do not progress ..
-		// pthread_cond_broadcast(&data_requests_list_cond[src_node]);
+	/* take all the entries from the request list */
+        _STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
+
+	struct _starpu_data_request_list *local_list = prefetch_requests[src_node];
+
+	if (_starpu_data_request_list_empty(local_list))
+	{
+		/* there is no request */
+                _STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
+		return;
 	}
 
-	starpu_data_request_list_delete(local_list);
+	/* There is an entry: we create a new empty list to replace the list of
+	 * requests, and we handle the request(s) one by one in the former
+	 * list, without concurrency issues.*/
+	prefetch_requests[src_node] = _starpu_data_request_list_new();
+
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
+
+	new_data_requests = _starpu_data_request_list_new();
+	new_prefetch_requests = _starpu_data_request_list_new();
+
+	/* for all entries of the list */
+	while (!_starpu_data_request_list_empty(local_list))
+	{
+                int res;
+
+		r = _starpu_data_request_list_pop_front(local_list);
+
+		res = starpu_handle_data_request(r, may_alloc);
+		if (res == -ENOMEM )
+		{
+			starpu_memstrategy_drop_prefetch[src_node]=1;
+			if (r->prefetch)
+				_starpu_data_request_list_push_back(new_prefetch_requests, r);
+			else
+			{
+				/* Prefetch request promoted while in tmp list*/
+				_starpu_data_request_list_push_back(new_data_requests, r);
+			}
+			break;
+		}
+	}
+
+	while(!_starpu_data_request_list_empty(local_list) && starpu_memstrategy_drop_prefetch[src_node])
+	{
+		r = _starpu_data_request_list_pop_front(local_list);
+		if (r->prefetch)
+			_starpu_data_request_list_push_back(new_prefetch_requests, r);
+		else
+			_starpu_data_request_list_push_back(new_data_requests, r);
+	}
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
+	_starpu_data_request_list_push_list_front(new_data_requests, data_requests[src_node]);
+	_starpu_data_request_list_push_list_front(new_prefetch_requests, prefetch_requests[src_node]);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
+
+	_starpu_data_request_list_delete(new_data_requests);
+	_starpu_data_request_list_delete(new_prefetch_requests);
+	_starpu_data_request_list_delete(local_list);
 }
 
 static void _handle_pending_node_data_requests(uint32_t src_node, unsigned force)
 {
 //	_STARPU_DEBUG("_starpu_handle_pending_node_data_requests ...\n");
+//
+	struct _starpu_data_request_list *new_data_requests_pending = _starpu_data_request_list_new();
 
-	PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[src_node]);
+	_STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[src_node]);
 
 	/* for all entries of the list */
-	starpu_data_request_list_t local_list = data_requests_pending[src_node];
-	data_requests_pending[src_node] = starpu_data_request_list_new();
+	struct _starpu_data_request_list *local_list = data_requests_pending[src_node];
+	data_requests_pending[src_node] = _starpu_data_request_list_new();
 
-	PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
 
-	while (!starpu_data_request_list_empty(local_list))
+	while (!_starpu_data_request_list_empty(local_list))
 	{
-		starpu_data_request_t r;
-		r = starpu_data_request_list_pop_back(local_list);
+		struct _starpu_data_request *r;
+		r = _starpu_data_request_list_pop_front(local_list);
+
+		starpu_data_handle_t handle = r->handle;
 
-		starpu_data_handle handle = r->handle;
-		
 		_starpu_spin_lock(&handle->header_lock);
-	
+
 		_starpu_spin_lock(&r->lock);
-	
+
 		/* wait until the transfer is terminated */
 		if (force)
 		{
 			_starpu_driver_wait_request_completion(&r->async_channel);
 			starpu_handle_data_request_completion(r);
 		}
-		else {
+		else
+		{
 			if (_starpu_driver_test_request_completion(&r->async_channel))
 			{
 				/* The request was completed */
 				starpu_handle_data_request_completion(r);
 			}
-			else {
+			else
+			{
 				/* The request was not completed, so we put it
 				 * back again on the list of pending requests
 				 * so that it can be handled later on. */
 				_starpu_spin_unlock(&r->lock);
 				_starpu_spin_unlock(&handle->header_lock);
 
-				PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[src_node]);
-				starpu_data_request_list_push_front(data_requests_pending[src_node], r);
-				PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
+				_starpu_data_request_list_push_back(new_data_requests_pending, r);
 			}
 		}
 	}
+	_STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[src_node]);
+	_starpu_data_request_list_push_list_back(data_requests_pending[src_node], new_data_requests_pending);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
 
-	starpu_data_request_list_delete(local_list);
+	_starpu_data_request_list_delete(local_list);
+	_starpu_data_request_list_delete(new_data_requests_pending);
 }
 
 void _starpu_handle_pending_node_data_requests(uint32_t src_node)
@@ -451,8 +574,43 @@ void _starpu_handle_all_pending_node_data_requests(uint32_t src_node)
 int _starpu_check_that_no_data_request_exists(uint32_t node)
 {
 	/* XXX lock that !!! that's a quick'n'dirty test */
-	int no_request = starpu_data_request_list_empty(data_requests[node]);
-	int no_pending = starpu_data_request_list_empty(data_requests_pending[node]);
+	int no_request = _starpu_data_request_list_empty(data_requests[node]);
+	int no_pending = _starpu_data_request_list_empty(data_requests_pending[node]);
 
 	return (no_request && no_pending);
 }
+
+
+void _starpu_update_prefetch_status(struct _starpu_data_request *r)
+{
+	STARPU_ASSERT(r->prefetch > 0);
+	r->prefetch=0;
+
+	/* We have to promote chained_request too! */
+	unsigned chained_req;
+	for (chained_req = 0; chained_req < r->next_req_count; chained_req++)
+	{
+		struct _starpu_data_request *next_req = r->next_req[chained_req];
+		if (next_req->prefetch)
+			_starpu_update_prefetch_status(next_req);
+	}
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[r->handling_node]);
+
+	/* The request can be in a different list (handling request or the temp list)
+	 * we have to check that it is really in the prefetch list. */
+	struct _starpu_data_request *r_iter;
+	for (r_iter = _starpu_data_request_list_begin(prefetch_requests[r->handling_node]);
+	     r_iter != _starpu_data_request_list_end(prefetch_requests[r->handling_node]);
+	     r_iter = _starpu_data_request_list_next(r_iter))
+	{
+
+		if (r==r_iter)
+		{
+			_starpu_data_request_list_erase(prefetch_requests[r->handling_node],r);
+			_starpu_data_request_list_push_front(data_requests[r->handling_node],r);
+			break;
+		}
+	}
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[r->handling_node]);
+}
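
_starpu_handle_node_data_requests(), its prefetch variant and _handle_pending_node_data_requests() all follow the same locking discipline: detach the whole list while holding the per-node mutex, replace it with a fresh empty list, process the detached requests without the lock, and splice whatever failed with -ENOMEM (or is still in flight) back onto the shared list. A condensed, self-contained restatement of that pattern, with an invented request type and a fake handler, is:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct req { int id; struct req *next; };

static struct req *pending;			/* shared queue for one node */
static pthread_mutex_t pending_mutex = PTHREAD_MUTEX_INITIALIZER;

static void push_front(struct req **list, struct req *r) { r->next = *list; *list = r; }

/* Fake handler: odd ids "fail" with -ENOMEM and must be retried later. */
static int handle(struct req *r) { return (r->id & 1) ? -ENOMEM : 0; }

static void handle_node_requests(void)
{
	/* 1. Detach the whole list under the lock. */
	pthread_mutex_lock(&pending_mutex);
	struct req *local = pending;
	pending = NULL;
	pthread_mutex_unlock(&pending_mutex);

	/* 2. Process without holding the lock; keep failures aside. */
	struct req *requeue = NULL;
	while (local)
	{
		struct req *r = local;
		local = r->next;
		if (handle(r) == -ENOMEM)
			push_front(&requeue, r);
		else
			free(r);
	}

	/* 3. Splice the failures back onto whatever arrived meanwhile. */
	pthread_mutex_lock(&pending_mutex);
	while (requeue)
	{
		struct req *r = requeue;
		requeue = r->next;
		push_front(&pending, r);
	}
	pthread_mutex_unlock(&pending_mutex);
}

int main(void)
{
	for (int i = 0; i < 4; i++)
	{
		struct req *r = malloc(sizeof(*r));
		r->id = i;
		push_front(&pending, r);
	}
	handle_node_requests();

	for (struct req *r = pending; r; r = r->next)
		printf("still pending: %d\n", r->id);
	return 0;
}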

+ 34 - 28
src/datawizard/data_request.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,29 +24,31 @@
 #include <common/list.h>
 #include <common/starpu_spinlock.h>
 
-struct starpu_data_replicate_s;
+struct _starpu_data_replicate;
 
-struct callback_list {
+struct _starpu_callback_list
+{
 	void (*callback_func)(void *);
 	void *callback_arg;
-	struct callback_list *next;
+	struct _starpu_callback_list *next;
 };
 
-LIST_TYPE(starpu_data_request,
-	starpu_spinlock_t lock;
+LIST_TYPE(_starpu_data_request,
+	struct _starpu_spinlock lock;
 	unsigned refcnt;
 
-	starpu_data_handle handle;
-	struct starpu_data_replicate_s *src_replicate;
-	struct starpu_data_replicate_s *dst_replicate;
+	starpu_data_handle_t handle;
+	struct _starpu_data_replicate *src_replicate;
+	struct _starpu_data_replicate *dst_replicate;
 
 	uint32_t handling_node;
 
-	starpu_access_mode mode;
+	enum starpu_access_mode mode;
 
-	struct starpu_async_channel async_channel;
+	struct _starpu_async_channel async_channel;
 
 	unsigned completed;
+	unsigned prefetch;
 	int retval;
 
 	/* The request will not actually be submitted until there remains
@@ -54,28 +56,28 @@ LIST_TYPE(starpu_data_request,
 	unsigned ndeps;
 
 	/* in case we have a chain of request (eg. for nvidia multi-GPU) */
-	struct starpu_data_request_s *next_req[STARPU_MAXNODES];
+	struct _starpu_data_request *next_req[STARPU_MAXNODES];
 	/* who should perform the next request ? */
 	unsigned next_req_count;
 
-	struct callback_list *callbacks;
+	struct _starpu_callback_list *callbacks;
 
 #ifdef STARPU_USE_FXT
 	unsigned com_id;
 #endif
-);
+)
 
 /* Everyone that wants to access some piece of data will post a request.
  * Not only StarPU internals, but also the application may put such requests */
-LIST_TYPE(starpu_data_requester,
+LIST_TYPE(_starpu_data_requester,
 	/* what kind of access is requested ? */
-	starpu_access_mode mode;
+	enum starpu_access_mode mode;
 
 	/* applications may also directly manipulate data */
 	unsigned is_requested_by_codelet;
 
 	/* in case this is a codelet that will do the access */
-	struct starpu_job_s *j;
+	struct _starpu_job *j;
 	unsigned buffer_index;
 
 	/* if this is more complicated ... (eg. application request) 
@@ -83,28 +85,32 @@ LIST_TYPE(starpu_data_requester,
 	 */
 	void (*ready_data_callback)(void *argcb);
 	void *argcb;
-);
+)
 
 void _starpu_init_data_request_lists(void);
 void _starpu_deinit_data_request_lists(void);
-void _starpu_post_data_request(starpu_data_request_t r, uint32_t handling_node);
+void _starpu_post_data_request(struct _starpu_data_request *r, uint32_t handling_node);
 void _starpu_handle_node_data_requests(uint32_t src_node, unsigned may_alloc);
+void _starpu_handle_node_prefetch_requests(uint32_t src_node, unsigned may_alloc);
 
 void _starpu_handle_pending_node_data_requests(uint32_t src_node);
 void _starpu_handle_all_pending_node_data_requests(uint32_t src_node);
 
 int _starpu_check_that_no_data_request_exists(uint32_t node);
 
-starpu_data_request_t _starpu_create_data_request(starpu_data_handle handle,
-				struct starpu_data_replicate_s *src_replicate,
-				struct starpu_data_replicate_s *dst_replicate,
-				uint32_t handling_node,
-				starpu_access_mode mode,
-				unsigned ndeps);
+struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t handle,
+							 struct _starpu_data_replicate *src_replicate,
+							 struct _starpu_data_replicate *dst_replicate,
+							 uint32_t handling_node,
+							 enum starpu_access_mode mode,
+							 unsigned ndeps,
+							 unsigned is_prefetch);
 
-int _starpu_wait_data_request_completion(starpu_data_request_t r, unsigned may_alloc);
+int _starpu_wait_data_request_completion(struct _starpu_data_request *r, unsigned may_alloc);
 
-void _starpu_data_request_append_callback(starpu_data_request_t r,
-			void (*callback_func)(void *), void *callback_arg);
+void _starpu_data_request_append_callback(struct _starpu_data_request *r,
+					  void (*callback_func)(void *),
+					  void *callback_arg);
 
+void _starpu_update_prefetch_status(struct _starpu_data_request *r);
 #endif // __DATA_REQUEST_H__
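
The _starpu_callback_list renamed in this header is a plain singly-linked list of (function, argument) pairs; starpu_handle_data_request_completion() walks it, invokes each entry and frees the links. A minimal standalone version of that append-then-fire pattern (invented names, not the StarPU structures) looks like:

#include <stdio.h>
#include <stdlib.h>

struct callback_list
{
	void (*func)(void *);
	void *arg;
	struct callback_list *next;
};

/* Prepend a callback; the completion path will fire them all later. */
static void append_callback(struct callback_list **head,
			    void (*func)(void *), void *arg)
{
	struct callback_list *link = malloc(sizeof(*link));
	link->func = func;
	link->arg = arg;
	link->next = *head;
	*head = link;
}

/* Invoke and free every registered callback, as done on completion. */
static void fire_callbacks(struct callback_list *head)
{
	while (head)
	{
		struct callback_list *next = head->next;
		head->func(head->arg);
		free(head);
		head = next;
	}
}

static void say(void *arg) { printf("request done: %s\n", (const char *) arg); }

int main(void)
{
	struct callback_list *cbs = NULL;
	append_callback(&cbs, say, "wake worker");
	append_callback(&cbs, say, "notify application");
	fire_callbacks(cbs);
	return 0;
}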

+ 87 - 18
src/datawizard/datastats.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -17,6 +17,7 @@
 
 #include <starpu.h>
 #include <datawizard/datastats.h>
+#include <datawizard/coherency.h>
 #include <common/config.h>
 
 #ifdef STARPU_DATA_STATS
@@ -94,13 +95,13 @@ void _starpu_display_alloc_cache_stats(void)
 #ifdef STARPU_DATA_STATS
 	fprintf(stderr, "Allocation cache stats:\n");
 	unsigned node;
-	for (node = 0; node < STARPU_MAXNODES; node++) 
+	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		if (alloc_cnt[node]) 
+		if (alloc_cnt[node])
 		{
 			fprintf(stderr, "memory node %d\n", node);
 			fprintf(stderr, "\ttotal alloc : %u\n", alloc_cnt[node]);
-			fprintf(stderr, "\tcached alloc: %u (%2.2f \%%)\n", 
+			fprintf(stderr, "\tcached alloc: %u (%2.2f \%%)\n",
 				alloc_cache_hit_cnt[node], (100.0f*alloc_cache_hit_cnt[node])/(alloc_cnt[node]));
 		}
 	}
@@ -109,38 +110,106 @@ void _starpu_display_alloc_cache_stats(void)
 
 /* measure the amount of data transfers between each pair of nodes */
 #ifdef STARPU_DATA_STATS
+static size_t comm_amount[STARPU_MAXNODES][STARPU_MAXNODES];
+#endif /* STARPU_DATA_STATS */
 
-static size_t comm_ammount[STARPU_MAXNODES][STARPU_MAXNODES];
+void _starpu_comm_amounts_inc(unsigned src  __attribute__ ((unused)), unsigned dst  __attribute__ ((unused)), size_t size  __attribute__ ((unused)))
+{
+#ifdef STARPU_DATA_STATS
+	comm_amount[src][dst] += size;
+#endif /* STARPU_DATA_STATS */
+}
 
 void _starpu_display_comm_amounts(void)
 {
+#ifdef STARPU_DATA_STATS
 	unsigned src, dst;
 
-	unsigned long sum = 0;
+	size_t sum = 0;
+
+	for (dst = 0; dst < STARPU_MAXNODES; dst++)
+		for (src = 0; src < STARPU_MAXNODES; src++)
+		{
+			sum += comm_amount[src][dst];
+			sum += comm_amount[dst][src];
+		}
+
+	fprintf(stderr, "\nData transfers stats:\nTOTAL transfers %f MB\n", (float)sum/1024/1024);
 
 	for (dst = 0; dst < STARPU_MAXNODES; dst++)
-	for (src = 0; src < STARPU_MAXNODES; src++)
+		for (src = dst + 1; src < STARPU_MAXNODES; src++)
+		{
+			if (comm_amount[src][dst])
+				fprintf(stderr, "\t%d <-> %d\t%f MB\n\t\t%d -> %d\t%f MB\n\t\t%d -> %d\t%f MB\n",
+					src, dst, ((float)comm_amount[src][dst] + (float)comm_amount[dst][src])/(1024*1024),
+					src, dst, ((float)comm_amount[src][dst])/(1024*1024),
+					dst, src, ((float)comm_amount[dst][src])/(1024*1024));
+		}
+#endif
+}
+
+#ifdef STARPU_MEMORY_STATUS
+void _starpu_display_data_stats(void)
+{
+	unsigned node;
+	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		sum += (unsigned long)comm_ammount[src][dst];
+		_starpu_display_data_stats_by_node(node);
 	}
+}
 
-	fprintf(stderr, "\nData transfers stats:\nTOTAL transfers %ld MB\n", sum/(1024*1024));
+void _starpu_display_data_handle_stats(starpu_data_handle_t handle)
+{
+	unsigned node;
 
-	for (dst = 0; dst < STARPU_MAXNODES; dst++)
-	for (src = dst + 1; src < STARPU_MAXNODES; src++)
+	fprintf(stderr, "#-----\n");
+	fprintf(stderr, "Data : %p\n", handle);
+	fprintf(stderr, "Size : %d\n", (int)handle->data_size);
+	fprintf(stderr, "\n");
+
+	fprintf(stderr, "#--\n");
+	fprintf(stderr, "Data access stats\n");
+	fprintf(stderr, "/!\\ Work Underway\n");
+	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		if (comm_ammount[src][dst])
-			fprintf(stderr, "\t%d <-> %d\t%ld MB\n\t\t%d -> %d\t%ld MB\n\t\t%d -> %d\t%ld MB\n",
-				src, dst, ((unsigned long)comm_ammount[src][dst] + (unsigned long)comm_ammount[dst][src])/(1024*1024),
-				src, dst, ((unsigned long)comm_ammount[src][dst])/(1024*1024),
-				dst, src, ((unsigned long)comm_ammount[dst][src])/(1024*1024));
+		if (handle->stats_direct_access[node]+handle->stats_loaded_shared[node]
+		    +handle->stats_invalidated[node]+handle->stats_loaded_owner[node])
+		{
+			fprintf(stderr, "Node #%d\n", node);
+			fprintf(stderr, "\tDirect access : %d\n", handle->stats_direct_access[node]);
+			/* XXX Not Working yet. */
+			if (handle->stats_shared_to_owner[node])
+				fprintf(stderr, "\t\tShared to Owner : %d\n", handle->stats_shared_to_owner[node]);
+			fprintf(stderr, "\tLoaded (Owner) : %d\n", handle->stats_loaded_owner[node]);
+			fprintf(stderr, "\tLoaded (Shared) : %d\n", handle->stats_loaded_shared[node]);
+			fprintf(stderr, "\tInvalidated (was Owner) : %d\n\n", handle->stats_invalidated[node]);
+		}
 	}
 }
 
-#else
+void _starpu_handle_stats_cache_hit(starpu_data_handle_t handle, unsigned node)
+{
+	handle->stats_direct_access[node]++;
+}
 
-void _starpu_display_comm_amounts(void)
+void _starpu_handle_stats_loaded_shared(starpu_data_handle_t handle, unsigned node)
+{
+	handle->stats_loaded_shared[node]++;
+}
+
+void _starpu_handle_stats_loaded_owner(starpu_data_handle_t handle, unsigned node)
+{
+	handle->stats_loaded_owner[node]++;
+}
+
+void _starpu_handle_stats_shared_to_owner(starpu_data_handle_t handle, unsigned node)
+{
+	handle->stats_shared_to_owner[node]++;
+}
+
+void _starpu_handle_stats_invalidated(starpu_data_handle_t handle, unsigned node)
 {
+	handle->stats_invalidated[node]++;
 }
 
 #endif
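
The rewritten statistics keep one byte counter per (source node, destination node) pair and print the totals in MB. A stripped-down, self-contained version of that bookkeeping, with a hypothetical node count of 4 standing in for STARPU_MAXNODES, is:

#include <stdio.h>

#define MAXNODES 4	/* stand-in for STARPU_MAXNODES */

static size_t comm_amount[MAXNODES][MAXNODES];

static void comm_amounts_inc(unsigned src, unsigned dst, size_t size)
{
	comm_amount[src][dst] += size;
}

static void display_comm_amounts(void)
{
	size_t sum = 0;
	for (unsigned dst = 0; dst < MAXNODES; dst++)
		for (unsigned src = 0; src < MAXNODES; src++)
			sum += comm_amount[src][dst];

	printf("TOTAL transfers %f MB\n", (float) sum / (1024 * 1024));

	/* One line per unordered pair, with both directions detailed. */
	for (unsigned dst = 0; dst < MAXNODES; dst++)
		for (unsigned src = dst + 1; src < MAXNODES; src++)
			if (comm_amount[src][dst] || comm_amount[dst][src])
				printf("%u <-> %u: %f MB (%u->%u %f, %u->%u %f)\n",
				       src, dst,
				       (float) (comm_amount[src][dst] + comm_amount[dst][src]) / (1024 * 1024),
				       src, dst, (float) comm_amount[src][dst] / (1024 * 1024),
				       dst, src, (float) comm_amount[dst][src] / (1024 * 1024));
}

int main(void)
{
	comm_amounts_inc(0, 1, 32u << 20);	/* 32 MB RAM -> node 1 */
	comm_amounts_inc(1, 0,  8u << 20);	/*  8 MB back */
	display_comm_amounts();
	return 0;
}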

+ 12 - 2
src/datawizard/datastats.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -31,8 +31,18 @@ void _starpu_display_msi_stats(void);
 void _starpu_allocation_cache_hit(unsigned node __attribute__ ((unused)));
 void _starpu_data_allocation_inc_stats(unsigned node __attribute__ ((unused)));
 
-
+void _starpu_comm_amounts_inc(unsigned src, unsigned dst, size_t size);
 void _starpu_display_comm_amounts(void);
 void _starpu_display_alloc_cache_stats(void);
 
+void _starpu_display_data_stats();
+void _starpu_display_data_handle_stats(starpu_data_handle_t handle);
+
+void _starpu_handle_stats_cache_hit(starpu_data_handle_t handle, unsigned node);
+void _starpu_handle_stats_loaded_shared(starpu_data_handle_t handle, unsigned node);
+void _starpu_handle_stats_loaded_owner(starpu_data_handle_t handle, unsigned node);
+void _starpu_handle_stats_shared_to_owner(starpu_data_handle_t handle, unsigned node);
+void _starpu_handle_stats_invalidated(starpu_data_handle_t handle, unsigned node);
+
+
 #endif // __DATASTATS_H__

+ 104 - 50
src/datawizard/filters.c

@@ -1,8 +1,9 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2012 INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,12 +20,12 @@
 #include <datawizard/filters.h>
 #include <datawizard/footprint.h>
 
-static void starpu_data_create_children(starpu_data_handle handle, unsigned nchildren, struct starpu_data_filter *f);
+static void starpu_data_create_children(starpu_data_handle_t handle, unsigned nchildren, struct starpu_data_filter *f);
 
 /*
  * This function applies a data filter on all the elements of a partition
  */
-static void map_filter(starpu_data_handle root_handle, struct starpu_data_filter *f)
+static void map_filter(starpu_data_handle_t root_handle, struct starpu_data_filter *f)
 {
 	/* we need to apply the data filter on all leaf of the tree */
 	if (root_handle->nchildren == 0)
@@ -32,7 +33,8 @@ static void map_filter(starpu_data_handle root_handle, struct starpu_data_filter
 		/* this is a leaf */
 		starpu_data_partition(root_handle, f);
 	}
-	else {
+	else
+	{
 		/* try to apply the data filter recursively */
 		unsigned child;
 		for (child = 0; child < root_handle->nchildren; child++)
@@ -41,7 +43,7 @@ static void map_filter(starpu_data_handle root_handle, struct starpu_data_filter
 		}
 	}
 }
-void starpu_data_vmap_filters(starpu_data_handle root_handle, unsigned nfilters, va_list pa)
+void starpu_data_vmap_filters(starpu_data_handle_t root_handle, unsigned nfilters, va_list pa)
 {
 	unsigned i;
 	for (i = 0; i < nfilters; i++)
@@ -55,7 +57,7 @@ void starpu_data_vmap_filters(starpu_data_handle root_handle, unsigned nfilters,
 	}
 }
 
-void starpu_data_map_filters(starpu_data_handle root_handle, unsigned nfilters, ...)
+void starpu_data_map_filters(starpu_data_handle_t root_handle, unsigned nfilters, ...)
 {
 	va_list pa;
 	va_start(pa, nfilters);
@@ -63,12 +65,12 @@ void starpu_data_map_filters(starpu_data_handle root_handle, unsigned nfilters,
 	va_end(pa);
 }
 
-int starpu_data_get_nb_children(starpu_data_handle handle)
+int starpu_data_get_nb_children(starpu_data_handle_t handle)
 {
         return handle->nchildren;
 }
 
-starpu_data_handle starpu_data_get_child(starpu_data_handle handle, unsigned i)
+starpu_data_handle_t starpu_data_get_child(starpu_data_handle_t handle, unsigned i)
 {
 	STARPU_ASSERT(i < handle->nchildren);
 
@@ -76,25 +78,25 @@ starpu_data_handle starpu_data_get_child(starpu_data_handle handle, unsigned i)
 }
 
 /*
- * example starpu_data_get_sub_data(starpu_data_handle root_handle, 3, 42, 0, 1);
+ * example starpu_data_get_sub_data(starpu_data_handle_t root_handle, 3, 42, 0, 1);
  */
-starpu_data_handle starpu_data_get_sub_data(starpu_data_handle root_handle, unsigned depth, ... )
+starpu_data_handle_t starpu_data_get_sub_data(starpu_data_handle_t root_handle, unsigned depth, ... )
 {
 	va_list pa;
 	va_start(pa, depth);
-	starpu_data_handle handle = starpu_data_vget_sub_data(root_handle, depth, pa);
+	starpu_data_handle_t handle = starpu_data_vget_sub_data(root_handle, depth, pa);
 	va_end(pa);
 
 	return handle;
 }
 
-starpu_data_handle starpu_data_vget_sub_data(starpu_data_handle root_handle, unsigned depth, va_list pa )
+starpu_data_handle_t starpu_data_vget_sub_data(starpu_data_handle_t root_handle, unsigned depth, va_list pa )
 {
 	STARPU_ASSERT(root_handle);
-	starpu_data_handle current_handle = root_handle;
+	starpu_data_handle_t current_handle = root_handle;
 
 	/* the variable number of argument must correlate the depth in the tree */
-	unsigned i; 
+	unsigned i;
 	for (i = 0; i < depth; i++)
 	{
 		unsigned next_child;
@@ -108,16 +110,16 @@ starpu_data_handle starpu_data_vget_sub_data(starpu_data_handle root_handle, uns
 	return current_handle;
 }
 
-void starpu_data_partition(starpu_data_handle initial_handle, struct starpu_data_filter *f)
+void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_data_filter *f)
 {
 	unsigned nparts;
 	unsigned i;
+	unsigned node;
 
 	/* first take care to properly lock the data header */
 	_starpu_spin_lock(&initial_handle->header_lock);
 
-	/* there should not be mutiple filters applied on the same data */
-	STARPU_ASSERT(initial_handle->nchildren == 0);
+	STARPU_ASSERT_MSG(initial_handle->nchildren == 0, "there should not be multiple filters applied on the same data");
 
 	/* how many parts ? */
 	if (f->get_nchildren)
@@ -132,9 +134,21 @@ void starpu_data_partition(starpu_data_handle initial_handle, struct starpu_data
 
 	unsigned nworkers = starpu_worker_get_count();
 
+	for (node = 0; node < STARPU_MAXNODES; node++)
+	{
+		if (initial_handle->per_node[node].state != STARPU_INVALID)
+			break;
+	}
+	if (node == STARPU_MAXNODES) {
+		/* This is lazy allocation, allocate it now in main RAM, so as
+		 * to have somewhere to gather pieces later */
+		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[0], 0);
+		STARPU_ASSERT(!ret);
+	}
+
 	for (i = 0; i < nparts; i++)
 	{
-		starpu_data_handle child =
+		starpu_data_handle_t child =
 			starpu_data_get_child(initial_handle, i);
 
 		STARPU_ASSERT(child);
@@ -152,15 +166,19 @@ void starpu_data_partition(starpu_data_handle initial_handle, struct starpu_data
 		child->is_readonly = initial_handle->is_readonly;
 
 		/* initialize the chunk lock */
-		child->req_list = starpu_data_requester_list_new();
-		child->reduction_req_list = starpu_data_requester_list_new();
+		child->req_list = _starpu_data_requester_list_new();
+		child->reduction_req_list = _starpu_data_requester_list_new();
 		child->refcnt = 0;
+		child->busy_count = 0;
+		child->busy_waiting = 0;
+		_STARPU_PTHREAD_MUTEX_INIT(&child->busy_mutex, NULL);
+		_STARPU_PTHREAD_COND_INIT(&child->busy_cond, NULL);
 		child->reduction_refcnt = 0;
 		_starpu_spin_init(&child->header_lock);
 
 		child->sequential_consistency = initial_handle->sequential_consistency;
 
-		PTHREAD_MUTEX_INIT(&child->sequential_consistency_mutex, NULL);
+		_STARPU_PTHREAD_MUTEX_INIT(&child->sequential_consistency_mutex, NULL);
 		child->last_submitted_mode = STARPU_R;
 		child->last_submitted_writer = NULL;
 		child->last_submitted_readers = NULL;
@@ -178,11 +196,10 @@ void starpu_data_partition(starpu_data_handle initial_handle, struct starpu_data
 		child->last_submitted_ghost_readers_id = NULL;
 #endif
 
-		unsigned node;
 		for (node = 0; node < STARPU_MAXNODES; node++)
 		{
-			struct starpu_data_replicate_s *initial_replicate; 
-			struct starpu_data_replicate_s *child_replicate;
+			struct _starpu_data_replicate *initial_replicate;
+			struct _starpu_data_replicate *child_replicate;
 
 			initial_replicate = &initial_handle->per_node[node];
 			child_replicate = &child->per_node[node];
@@ -193,7 +210,7 @@ void starpu_data_partition(starpu_data_handle initial_handle, struct starpu_data
 			child_replicate->refcnt = 0;
 			child_replicate->memory_node = node;
 			child_replicate->relaxed_coherency = 0;
-			
+
 			/* update the interface */
 			void *initial_interface = starpu_data_get_interface_on_node(initial_handle, node);
 			void *child_interface = starpu_data_get_interface_on_node(child, node);
@@ -204,9 +221,9 @@ void starpu_data_partition(starpu_data_handle initial_handle, struct starpu_data
 		unsigned worker;
 		for (worker = 0; worker < nworkers; worker++)
 		{
-			struct starpu_data_replicate_s *child_replicate;
+			struct _starpu_data_replicate *child_replicate;
 			child_replicate = &child->per_worker[worker];
-			
+
 			child_replicate->state = STARPU_INVALID;
 			child_replicate->allocated = 0;
 			child_replicate->automatically_allocated = 0;
@@ -242,39 +259,72 @@ void starpu_data_partition(starpu_data_handle initial_handle, struct starpu_data
 	_starpu_spin_unlock(&initial_handle->header_lock);
 }
 
-void starpu_data_unpartition(starpu_data_handle root_handle, uint32_t gathering_node)
+static
+void _starpu_empty_codelet_function(void *buffers[], void *args)
+{
+	(void) buffers; // unused;
+	(void) args; // unused;
+}
+
+void starpu_data_unpartition(starpu_data_handle_t root_handle, uint32_t gathering_node)
 {
 	unsigned child;
 	unsigned node;
 
 	_starpu_spin_lock(&root_handle->header_lock);
 
+	STARPU_ASSERT_MSG(root_handle->nchildren != 0, "data is not partitioned");
+
 	/* first take all the children lock (in order !) */
 	for (child = 0; child < root_handle->nchildren; child++)
 	{
-		struct starpu_data_state_t *child_handle = &root_handle->children[child];
+		struct _starpu_data_state *child_handle = &root_handle->children[child];
 
 		/* make sure the intermediate children is unpartitionned as well */
 		if (child_handle->nchildren > 0)
 			starpu_data_unpartition(child_handle, gathering_node);
 
+		/* If this is a multiformat handle, we must convert the data now */
+#ifdef STARPU_DEVEL
+#warning TODO: _starpu_fetch_data_on_node should be doing it
+#endif
+		if (_starpu_data_is_multiformat_handle(child_handle) &&
+			starpu_node_get_kind(child_handle->mf_node) != STARPU_CPU_RAM)
+		{
+			struct starpu_codelet cl =
+			{
+				.where = STARPU_CPU,
+				.cpu_funcs = { _starpu_empty_codelet_function, NULL },
+				.modes = { STARPU_RW },
+				.nbuffers = 1
+			};
+			struct starpu_task *task = starpu_task_create();
+			task->handles[0] = child_handle;
+			task->cl = &cl;
+			task->synchronous = 1;
+			if (starpu_task_submit(task) != 0)
+				_STARPU_ERROR("Could not submit the conversion task while unpartitioning\n");
+		}
+
 		int ret;
-		ret = _starpu_fetch_data_on_node(child_handle, &child_handle->per_node[gathering_node], STARPU_R, 0, NULL, NULL);
-		/* for now we pretend that the RAM is almost unlimited and that gathering 
+		ret = _starpu_fetch_data_on_node(child_handle, &child_handle->per_node[gathering_node], STARPU_R, 0, 0, NULL, NULL);
+		/* for now we pretend that the RAM is almost unlimited and that gathering
 		 * data should be possible from the node that does the unpartionning ... we
 		 * don't want to have the programming deal with memory shortage at that time,
 		 * really */
-		STARPU_ASSERT(ret == 0); 
+		STARPU_ASSERT(ret == 0);
+
+		_starpu_spin_lock(&child_handle->header_lock);
 
 		_starpu_data_free_interfaces(&root_handle->children[child]);
-		starpu_data_requester_list_delete(child_handle->req_list);
-		starpu_data_requester_list_delete(child_handle->reduction_req_list);
+		_starpu_data_requester_list_delete(child_handle->req_list);
+		_starpu_data_requester_list_delete(child_handle->reduction_req_list);
 	}
 
 	/* the gathering_node should now have a valid copy of all the children.
 	 * For all nodes, if the node had all copies and none was locally
 	 * allocated then the data is still valid there, else, it's invalidated
-	 * for the gathering node, if we have some locally allocated data, we 
+	 * for the gathering node, if we have some locally allocated data, we
 	 * copy all the children (XXX this should not happen so we just do not
 	 * do anything since this is transparent ?) */
 	unsigned still_valid[STARPU_MAXNODES];
@@ -293,11 +343,12 @@ void starpu_data_unpartition(starpu_data_handle root_handle, uint32_t gathering_
 
 		for (child = 0; child < root_handle->nchildren; child++)
 		{
-			struct starpu_data_replicate_s *local = &root_handle->children[child].per_node[node];
+			struct _starpu_data_replicate *local = &root_handle->children[child].per_node[node];
 
-			if (local->state == STARPU_INVALID) {
+			if (local->state == STARPU_INVALID)
+			{
 				/* One of the bits is missing */
-				isvalid = 0; 
+				isvalid = 0;
 			}
 
 			if (local->allocated && local->automatically_allocated)
@@ -323,16 +374,17 @@ void starpu_data_unpartition(starpu_data_handle root_handle, uint32_t gathering_
 	/* either shared or owned */
 	STARPU_ASSERT(nvalids > 0);
 
-	starpu_cache_state newstate = (nvalids == 1)?STARPU_OWNER:STARPU_SHARED;
+	enum _starpu_cache_state newstate = (nvalids == 1)?STARPU_OWNER:STARPU_SHARED;
 
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		root_handle->per_node[node].state = 
+		root_handle->per_node[node].state =
 			still_valid[node]?newstate:STARPU_INVALID;
 	}
 
 	/* there is no child anymore */
-	//free(root_handle->children);
+	free(root_handle->children);
+	root_handle->children = NULL;
 	root_handle->nchildren = 0;
 
 	/* now the parent may be used again so we release the lock */
@@ -340,9 +392,9 @@ void starpu_data_unpartition(starpu_data_handle root_handle, uint32_t gathering_
 }
 
 /* each child may have his own interface type */
-static void starpu_data_create_children(starpu_data_handle handle, unsigned nchildren, struct starpu_data_filter *f)
+static void starpu_data_create_children(starpu_data_handle_t handle, unsigned nchildren, struct starpu_data_filter *f)
 {
-	handle->children = (struct starpu_data_state_t *) calloc(nchildren, sizeof(struct starpu_data_state_t));
+	handle->children = (struct _starpu_data_state *) calloc(nchildren, sizeof(struct _starpu_data_state));
 	STARPU_ASSERT(handle->children);
 
 	unsigned node;
@@ -353,16 +405,16 @@ static void starpu_data_create_children(starpu_data_handle handle, unsigned nchi
 
 	for (child = 0; child < nchildren; child++)
 	{
-		starpu_data_handle handle_child = &handle->children[child];
-		
-		struct starpu_data_interface_ops_t *ops;
-		
+		starpu_data_handle_t handle_child = &handle->children[child];
+
+		struct starpu_data_interface_ops *ops;
+
 		/* what's this child's interface ? */
 		if (f->get_child_ops)
 		  ops = f->get_child_ops(f, child);
 		else
 		  ops = handle->ops;
-		
+
 		handle_child->ops = ops;
 
 		size_t interfacesize = ops->interface_size;
@@ -381,8 +433,10 @@ static void starpu_data_create_children(starpu_data_handle handle, unsigned nchi
 			handle_child->per_worker[worker].data_interface = calloc(1, interfacesize);
 			STARPU_ASSERT(handle_child->per_worker[worker].data_interface);
 		}
+
+		handle_child->mf_node = handle->mf_node;
 	}
-	
+
 	/* this handle now has children */
 	handle->nchildren = nchildren;
 }

+ 24 - 14
src/datawizard/footprint.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,9 +16,9 @@
  */
 
 #include <datawizard/footprint.h>
-#include <common/hash.h>
+#include <starpu_hash.h>
 
-uint32_t _starpu_compute_buffers_footprint(starpu_job_t j)
+uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned nimpl, struct _starpu_job *j)
 {
 	if (j->footprint_is_computed)
 		return j->footprint;
@@ -28,13 +28,21 @@ uint32_t _starpu_compute_buffers_footprint(starpu_job_t j)
 
 	struct starpu_task *task = j->task;
 
-	for (buffer = 0; buffer < task->cl->nbuffers; buffer++)
-	{
-		starpu_data_handle handle = task->buffers[buffer].handle;
-
-		uint32_t handle_footprint = _starpu_data_get_footprint(handle);
-
-		footprint = _starpu_crc32_be(handle_footprint, footprint);
+	if (model && model->per_arch[arch][nimpl].size_base) {
+		size_t size = model->per_arch[arch][nimpl].size_base(task, arch, nimpl);
+		footprint = starpu_crc32_be_n(&size, sizeof(size), footprint);
+	} else if (model && model->size_base) {
+		size_t size = model->size_base(task, nimpl);
+		footprint = starpu_crc32_be_n(&size, sizeof(size), footprint);
+	} else {
+		for (buffer = 0; buffer < task->cl->nbuffers; buffer++)
+		{
+			starpu_data_handle_t handle = task->handles[buffer];
+
+			uint32_t handle_footprint = _starpu_data_get_footprint(handle);
+
+			footprint = starpu_crc32_be(handle_footprint, footprint);
+		}
 	}
 
 	j->footprint = footprint;
@@ -43,11 +51,13 @@ uint32_t _starpu_compute_buffers_footprint(starpu_job_t j)
 	return footprint;
 }
 
-uint32_t _starpu_compute_data_footprint(starpu_data_handle handle)
+uint32_t _starpu_compute_data_footprint(starpu_data_handle_t handle)
 {
-	uint32_t interfaceid = (uint32_t)starpu_get_handle_interface_id(handle);
+	uint32_t interfaceid = (uint32_t)starpu_handle_get_interface_id(handle);
+
+	STARPU_ASSERT(handle->ops->footprint);
 
 	uint32_t handle_footprint = handle->ops->footprint(handle);
 
-	return _starpu_crc32_be(handle_footprint, interfaceid);
+	return starpu_crc32_be(handle_footprint, interfaceid);
 }
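
The size_base hooks introduced above let a performance model characterize a task by an application-chosen size instead of the per-handle footprints. A minimal sketch, assuming a codelet whose first handle is a vector; the symbol name and helper below are illustrative only:

#include <starpu.h>

/* Illustrative size function, matching the size_base(task, nimpl) call above:
 * the task is characterized by the length of its first (vector) handle. */
static size_t vector_size_base(struct starpu_task *task, unsigned nimpl)
{
	(void) nimpl;
	return starpu_vector_get_nx(task->handles[0]);
}

static struct starpu_perfmodel vector_model =
{
	.type = STARPU_HISTORY_BASED,
	.symbol = "vector_kernel",
	.size_base = vector_size_base, /* hashed via starpu_crc32_be_n() above */
};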

+ 4 - 4
src/datawizard/footprint.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,9 +24,9 @@
 
 /* Compute the footprint that characterizes the job and cache it into the job
  * structure. */
-uint32_t _starpu_compute_buffers_footprint(struct starpu_job_s *j);
+uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned nimpl, struct _starpu_job *j);
 
 /* Compute the footprint that characterizes the layout of the data handle. */
-uint32_t _starpu_compute_data_footprint(starpu_data_handle handle);
+uint32_t _starpu_compute_data_footprint(starpu_data_handle_t handle);
 
 #endif // __FOOTPRINT_H__

+ 8 - 7
src/datawizard/interfaces/bcsr_filters.c

@@ -22,17 +22,17 @@
 
 void starpu_canonical_block_filter_bcsr(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, STARPU_ATTRIBUTE_UNUSED unsigned nparts)
 {
-	struct starpu_bcsr_interface_s *bcsr_father = (struct starpu_bcsr_interface_s *) father_interface;
+	struct starpu_bcsr_interface *bcsr_father = (struct starpu_bcsr_interface *) father_interface;
 	/* each chunk becomes a small dense matrix */
-	starpu_matrix_interface_t *matrix_child = (starpu_matrix_interface_t *) child_interface;
-	
+	struct starpu_matrix_interface *matrix_child = (struct starpu_matrix_interface *) child_interface;
+
 	size_t elemsize = bcsr_father->elemsize;
 	uint32_t firstentry = bcsr_father->firstentry;
 
 	/* size of the tiles */
 	uint32_t r = bcsr_father->r;
 	uint32_t c = bcsr_father->c;
-	
+
 	uint32_t ptr_offset = c*r*id*elemsize;
 
 	matrix_child->nx = c;
@@ -40,8 +40,9 @@ void starpu_canonical_block_filter_bcsr(void *father_interface, void *child_inte
 	matrix_child->ld = c;
 	matrix_child->elemsize = elemsize;
 
-	if (bcsr_father->nzval) {
-	  uint8_t *nzval = (uint8_t *)(bcsr_father->nzval);
-	  matrix_child->ptr = (uintptr_t)&nzval[firstentry + ptr_offset];
+	if (bcsr_father->nzval)
+	{
+		uint8_t *nzval = (uint8_t *)(bcsr_father->nzval);
+		matrix_child->ptr = (uintptr_t)&nzval[firstentry + ptr_offset];
 	}
 }

+ 97 - 88
src/datawizard/interfaces/bcsr_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -21,7 +21,7 @@
 #include <datawizard/coherency.h>
 #include <datawizard/copy_driver.h>
 #include <datawizard/filters.h>
-#include <common/hash.h>
+#include <starpu_hash.h>
 
 #include <starpu_cuda.h>
 #include <starpu_opencl.h>
@@ -42,7 +42,8 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTR
 static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 #endif
 
-static const struct starpu_data_copy_methods bcsr_copy_data_methods_s = {
+static const struct starpu_data_copy_methods bcsr_copy_data_methods_s =
+{
 	.ram_to_ram = copy_ram_to_ram,
 	.ram_to_spu = NULL,
 #ifdef STARPU_USE_CUDA
@@ -60,42 +61,45 @@ static const struct starpu_data_copy_methods bcsr_copy_data_methods_s = {
 	.spu_to_spu = NULL
 };
 
-static void register_bcsr_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
+static void register_bcsr_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface);
 static ssize_t allocate_bcsr_buffer_on_node(void *data_interface, uint32_t dst_node);
 static void free_bcsr_buffer_on_node(void *data_interface, uint32_t node);
-static size_t bcsr_interface_get_size(starpu_data_handle handle);
+static size_t bcsr_interface_get_size(starpu_data_handle_t handle);
 static int bcsr_compare(void *data_interface_a, void *data_interface_b);
-static uint32_t footprint_bcsr_interface_crc32(starpu_data_handle handle);
+static uint32_t footprint_bcsr_interface_crc32(starpu_data_handle_t handle);
 
 
-static struct starpu_data_interface_ops_t interface_bcsr_ops = {
+static struct starpu_data_interface_ops interface_bcsr_ops =
+{
 	.register_data_handle = register_bcsr_handle,
 	.allocate_data_on_node = allocate_bcsr_buffer_on_node,
 	.free_data_on_node = free_bcsr_buffer_on_node,
 	.copy_methods = &bcsr_copy_data_methods_s,
 	.get_size = bcsr_interface_get_size,
 	.interfaceid = STARPU_BCSR_INTERFACE_ID,
-	.interface_size = sizeof(starpu_bcsr_interface_t),
+	.interface_size = sizeof(struct starpu_bcsr_interface),
 	.footprint = footprint_bcsr_interface_crc32,
 	.compare = bcsr_compare
 };
 
-static void register_bcsr_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface)
+static void register_bcsr_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface)
 {
-	starpu_bcsr_interface_t *bcsr_interface = (starpu_bcsr_interface_t *) data_interface;
+	struct starpu_bcsr_interface *bcsr_interface = (struct starpu_bcsr_interface *) data_interface;
 
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		starpu_bcsr_interface_t *local_interface = (starpu_bcsr_interface_t *)
+		struct starpu_bcsr_interface *local_interface = (struct starpu_bcsr_interface *)
 			starpu_data_get_interface_on_node(handle, node);
 
-		if (node == home_node) {
+		if (node == home_node)
+		{
 			local_interface->nzval = bcsr_interface->nzval;
 			local_interface->colind = bcsr_interface->colind;
 			local_interface->rowptr = bcsr_interface->rowptr;
 		}
-		else {
+		else
+		{
 			local_interface->nzval = 0;
 			local_interface->colind = NULL;
 			local_interface->rowptr = NULL;
@@ -110,12 +114,13 @@ static void register_bcsr_handle(starpu_data_handle handle, uint32_t home_node,
 	}
 }
 
-void starpu_bcsr_data_register(starpu_data_handle *handleptr, uint32_t home_node,
+void starpu_bcsr_data_register(starpu_data_handle_t *handleptr, uint32_t home_node,
 		uint32_t nnz, uint32_t nrow, uintptr_t nzval, uint32_t *colind,
 		uint32_t *rowptr, uint32_t firstentry,
 		uint32_t r, uint32_t c, size_t elemsize)
 {
-	starpu_bcsr_interface_t bcsr_interface = {
+	struct starpu_bcsr_interface bcsr_interface =
+	{
 		.nzval = nzval,
 		.colind = colind,
 		.rowptr = rowptr,
@@ -130,21 +135,21 @@ void starpu_bcsr_data_register(starpu_data_handle *handleptr, uint32_t home_node
 	starpu_data_register(handleptr, home_node, &bcsr_interface, &interface_bcsr_ops);
 }
 
-static uint32_t footprint_bcsr_interface_crc32(starpu_data_handle handle)
+static uint32_t footprint_bcsr_interface_crc32(starpu_data_handle_t handle)
 {
 	uint32_t hash;
 
-	hash = _starpu_crc32_be(starpu_bcsr_get_nnz(handle), 0);
-	hash = _starpu_crc32_be(starpu_bcsr_get_c(handle), hash);
-	hash = _starpu_crc32_be(starpu_bcsr_get_r(handle), hash);
+	hash = starpu_crc32_be(starpu_bcsr_get_nnz(handle), 0);
+	hash = starpu_crc32_be(starpu_bcsr_get_c(handle), hash);
+	hash = starpu_crc32_be(starpu_bcsr_get_r(handle), hash);
 
 	return hash;
 }
 
 static int bcsr_compare(void *data_interface_a, void *data_interface_b)
 {
-	starpu_bcsr_interface_t *bcsr_a = (starpu_bcsr_interface_t *) data_interface_a;
-	starpu_bcsr_interface_t *bcsr_b = (starpu_bcsr_interface_t *) data_interface_b;
+	struct starpu_bcsr_interface *bcsr_a = (struct starpu_bcsr_interface *) data_interface_a;
+	struct starpu_bcsr_interface *bcsr_b = (struct starpu_bcsr_interface *) data_interface_b;
 
 	/* Two matrices are considered compatible if they have the same size */
 	return ((bcsr_a->nnz == bcsr_b->nnz)
@@ -155,87 +160,87 @@ static int bcsr_compare(void *data_interface_a, void *data_interface_b)
 }
 
 /* offer access to the data parameters */
-uint32_t starpu_bcsr_get_nnz(starpu_data_handle handle)
+uint32_t starpu_bcsr_get_nnz(starpu_data_handle_t handle)
 {
-	starpu_bcsr_interface_t *data_interface = (starpu_bcsr_interface_t *)
+	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return data_interface->nnz;
 }
 
-uint32_t starpu_bcsr_get_nrow(starpu_data_handle handle)
+uint32_t starpu_bcsr_get_nrow(starpu_data_handle_t handle)
 {
-	starpu_bcsr_interface_t *data_interface = (starpu_bcsr_interface_t *)
+	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return data_interface->nrow;
 }
 
-uint32_t starpu_bcsr_get_firstentry(starpu_data_handle handle)
+uint32_t starpu_bcsr_get_firstentry(starpu_data_handle_t handle)
 {
-	starpu_bcsr_interface_t *data_interface = (starpu_bcsr_interface_t *)
+	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return data_interface->firstentry;
 }
 
-uint32_t starpu_bcsr_get_r(starpu_data_handle handle)
+uint32_t starpu_bcsr_get_r(starpu_data_handle_t handle)
 {
-	starpu_bcsr_interface_t *data_interface = (starpu_bcsr_interface_t *)
+	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return data_interface->r;
 }
 
-uint32_t starpu_bcsr_get_c(starpu_data_handle handle)
+uint32_t starpu_bcsr_get_c(starpu_data_handle_t handle)
 {
-	starpu_bcsr_interface_t *data_interface = (starpu_bcsr_interface_t *)
+	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return data_interface->c;
 }
 
-size_t starpu_bcsr_get_elemsize(starpu_data_handle handle)
+size_t starpu_bcsr_get_elemsize(starpu_data_handle_t handle)
 {
-	starpu_bcsr_interface_t *data_interface = (starpu_bcsr_interface_t *)
+	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return data_interface->elemsize;
 }
 
-uintptr_t starpu_bcsr_get_local_nzval(starpu_data_handle handle)
+uintptr_t starpu_bcsr_get_local_nzval(starpu_data_handle_t handle)
 {
 	unsigned node;
 	node = _starpu_get_local_memory_node();
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_bcsr_interface_t *data_interface = (starpu_bcsr_interface_t *)
+	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 		starpu_data_get_interface_on_node(handle, node);
-	
+
 	return data_interface->nzval;
 }
 
-uint32_t *starpu_bcsr_get_local_colind(starpu_data_handle handle)
+uint32_t *starpu_bcsr_get_local_colind(starpu_data_handle_t handle)
 {
 	/* XXX 0 */
-	starpu_bcsr_interface_t *data_interface = (starpu_bcsr_interface_t *)
+	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return data_interface->colind;
 }
 
-uint32_t *starpu_bcsr_get_local_rowptr(starpu_data_handle handle)
+uint32_t *starpu_bcsr_get_local_rowptr(starpu_data_handle_t handle)
 {
 	/* XXX 0 */
-	starpu_bcsr_interface_t *data_interface = (starpu_bcsr_interface_t *)
+	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return data_interface->rowptr;
 }
 
 
-static size_t bcsr_interface_get_size(starpu_data_handle handle)
+static size_t bcsr_interface_get_size(starpu_data_handle_t handle)
 {
 	size_t size;
 
@@ -245,7 +250,7 @@ static size_t bcsr_interface_get_size(starpu_data_handle handle)
 	uint32_t c = starpu_bcsr_get_c(handle);
 	size_t elemsize = starpu_bcsr_get_elemsize(handle);
 
-	size = nnz*r*c*elemsize + nnz*sizeof(uint32_t) + (nrow+1)*sizeof(uint32_t); 
+	size = nnz*r*c*elemsize + nnz*sizeof(uint32_t) + (nrow+1)*sizeof(uint32_t);
 
 	return size;
 }
@@ -261,7 +266,7 @@ static ssize_t allocate_bcsr_buffer_on_node(void *data_interface_, uint32_t dst_
 	ssize_t allocated_memory;
 
 	/* we need the 3 arrays to be allocated */
-	starpu_bcsr_interface_t *bcsr_interface = (starpu_bcsr_interface_t *) data_interface_;
+	struct starpu_bcsr_interface *bcsr_interface = (struct starpu_bcsr_interface *) data_interface_;
 
 	uint32_t nnz = bcsr_interface->nnz;
 	uint32_t nrow = bcsr_interface->nrow;
@@ -270,9 +275,10 @@ static ssize_t allocate_bcsr_buffer_on_node(void *data_interface_, uint32_t dst_
 	uint32_t r = bcsr_interface->r;
 	uint32_t c = bcsr_interface->c;
 
-	starpu_node_kind kind = _starpu_get_node_kind(dst_node);
+	enum starpu_node_kind kind = starpu_node_get_kind(dst_node);
 
-	switch(kind) {
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
 			addr_nzval = (uintptr_t)malloc(nnz*r*c*elemsize);
 			if (!addr_nzval)
@@ -305,42 +311,43 @@ static ssize_t allocate_bcsr_buffer_on_node(void *data_interface_, uint32_t dst_
 #endif
 #ifdef STARPU_USE_OPENCL
 		case STARPU_OPENCL_RAM:
-                        {
-                                int ret;
-                                void *ptr;
+		{
+			int ret;
+			cl_mem ptr;
 
-                                ret = _starpu_opencl_allocate_memory(&ptr, nnz*r*c*elemsize, CL_MEM_READ_WRITE);
-                                addr_nzval = (uintptr_t)ptr;
-                                if (ret) goto fail_nzval;
+			ret = starpu_opencl_allocate_memory(&ptr, nnz*r*c*elemsize, CL_MEM_READ_WRITE);
+			addr_nzval = (uintptr_t)ptr;
+			if (ret) goto fail_nzval;
 
-                                ret = _starpu_opencl_allocate_memory(&ptr, nnz*sizeof(uint32_t), CL_MEM_READ_WRITE);
-                                addr_colind = ptr;
-				if (ret) goto fail_colind;
+			ret = starpu_opencl_allocate_memory(&ptr, nnz*sizeof(uint32_t), CL_MEM_READ_WRITE);
+			addr_colind = (void*) ptr;
+			if (ret) goto fail_colind;
 
-                                ret = _starpu_opencl_allocate_memory(&ptr, (nrow+1)*sizeof(uint32_t), CL_MEM_READ_WRITE);
-                                addr_rowptr = ptr;
-				if (ret) goto fail_rowptr;
+			ret = starpu_opencl_allocate_memory(&ptr, (nrow+1)*sizeof(uint32_t), CL_MEM_READ_WRITE);
+			addr_rowptr = (void*) ptr;
+			if (ret) goto fail_rowptr;
 
-                                break;
-                        }
+			break;
+		}
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 
 	/* allocation succeeded */
-	allocated_memory = 
+	allocated_memory =
 		nnz*r*c*elemsize + nnz*sizeof(uint32_t) + (nrow+1)*sizeof(uint32_t);
 
 	/* update the data properly in consequence */
 	bcsr_interface->nzval = addr_nzval;
 	bcsr_interface->colind = addr_colind;
 	bcsr_interface->rowptr = addr_rowptr;
-	
+
 	return allocated_memory;
 
 fail_rowptr:
-	switch(kind) {
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
 			free((void *)addr_colind);
 #ifdef STARPU_USE_CUDA
@@ -354,11 +361,12 @@ fail_rowptr:
 			break;
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 
 fail_colind:
-	switch(kind) {
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
 			free((void *)addr_nzval);
 #ifdef STARPU_USE_CUDA
@@ -372,7 +380,7 @@ fail_colind:
 			break;
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 
 fail_nzval:
@@ -383,10 +391,11 @@ fail_nzval:
 
 static void free_bcsr_buffer_on_node(void *data_interface, uint32_t node)
 {
-	starpu_bcsr_interface_t *bcsr_interface = (starpu_bcsr_interface_t *) data_interface;
+	struct starpu_bcsr_interface *bcsr_interface = (struct starpu_bcsr_interface *) data_interface;
 
-	starpu_node_kind kind = _starpu_get_node_kind(node);
-	switch(kind) {
+	enum starpu_node_kind kind = starpu_node_get_kind(node);
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
 			free((void*)bcsr_interface->nzval);
 			free((void*)bcsr_interface->colind);
@@ -407,15 +416,15 @@ static void free_bcsr_buffer_on_node(void *data_interface, uint32_t node)
 			break;
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 }
 
 #ifdef STARPU_USE_CUDA
 static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
 {
-	starpu_bcsr_interface_t *src_bcsr = src_interface;
-	starpu_bcsr_interface_t *dst_bcsr = dst_interface;
+	struct starpu_bcsr_interface *src_bcsr = src_interface;
+	struct starpu_bcsr_interface *dst_bcsr = dst_interface;
 
 	uint32_t nnz = src_bcsr->nnz;
 	uint32_t nrow = src_bcsr->nrow;
@@ -438,7 +447,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 
 	return 0;
 }
@@ -462,8 +471,8 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRI
 #ifdef STARPU_USE_OPENCL
 static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
-	starpu_bcsr_interface_t *src_bcsr = src_interface;
-	starpu_bcsr_interface_t *dst_bcsr = dst_interface;
+	struct starpu_bcsr_interface *src_bcsr = src_interface;
+	struct starpu_bcsr_interface *dst_bcsr = dst_interface;
 
 	uint32_t nnz = src_bcsr->nnz;
 	uint32_t nrow = src_bcsr->nrow;
@@ -474,27 +483,27 @@ static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTR
 
         int err;
 
-	err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->nzval, (void *)dst_bcsr->nzval, nnz*r*c*elemsize, 0, NULL);
+	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->nzval, src_node, (void *)dst_bcsr->nzval, dst_node, nnz*r*c*elemsize, 0, NULL);
 	if (STARPU_UNLIKELY(err))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->colind, (void *)dst_bcsr->colind, nnz*sizeof(uint32_t), 0, NULL);
+	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->colind, src_node, (void *)dst_bcsr->colind, dst_node, nnz*sizeof(uint32_t), 0, NULL);
 	if (STARPU_UNLIKELY(err))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->rowptr, (void *)dst_bcsr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
+	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->rowptr, src_node, (void *)dst_bcsr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), 0, NULL);
 	if (STARPU_UNLIKELY(err))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 
 	return 0;
 }
 
 static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
-	starpu_bcsr_interface_t *src_bcsr = src_interface;
-	starpu_bcsr_interface_t *dst_bcsr = dst_interface;
+	struct starpu_bcsr_interface *src_bcsr = src_interface;
+	struct starpu_bcsr_interface *dst_bcsr = dst_interface;
 
 	uint32_t nnz = src_bcsr->nnz;
 	uint32_t nrow = src_bcsr->nrow;
@@ -505,19 +514,19 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTR
 
         int err;
 
-	err = _starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->nzval, (cl_mem)dst_bcsr->nzval, nnz*r*c*elemsize, 0, NULL);
+	err = starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->nzval, src_node, (cl_mem)dst_bcsr->nzval, dst_node, nnz*r*c*elemsize, 0, NULL);
 	if (STARPU_UNLIKELY(err))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = _starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->colind, (cl_mem)dst_bcsr->colind, nnz*sizeof(uint32_t), 0, NULL);
+	err = starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->colind, src_node, (cl_mem)dst_bcsr->colind, dst_node, nnz*sizeof(uint32_t), 0, NULL);
 	if (STARPU_UNLIKELY(err))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = _starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->rowptr, (cl_mem)dst_bcsr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
+	err = starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->rowptr, src_node, (cl_mem)dst_bcsr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), 0, NULL);
 	if (STARPU_UNLIKELY(err))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 
 	return 0;
 }
@@ -526,8 +535,8 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTR
 /* as not all platforms easily have a BLAS lib installed ... */
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
-	starpu_bcsr_interface_t *src_bcsr = (starpu_bcsr_interface_t *) src_interface;
-	starpu_bcsr_interface_t *dst_bcsr = (starpu_bcsr_interface_t *) dst_interface;
+	struct starpu_bcsr_interface *src_bcsr = (struct starpu_bcsr_interface *) src_interface;
+	struct starpu_bcsr_interface *dst_bcsr = (struct starpu_bcsr_interface *) dst_interface;
 
 	uint32_t nnz = src_bcsr->nnz;
 	uint32_t nrow = src_bcsr->nrow;
@@ -542,7 +551,7 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBU
 
 	memcpy((void *)dst_bcsr->rowptr, (void *)src_bcsr->rowptr, (nrow+1)*sizeof(uint32_t));
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize*r*c + (nnz+nrow+1)*sizeof(uint32_t));
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize*r*c + (nnz+nrow+1)*sizeof(uint32_t));
 
 	return 0;
 }
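
Apart from the starpu_data_handle_t and struct starpu_bcsr_interface renames, the registration entry point keeps the signature shown above. A minimal usage sketch, with the CSR-style arrays assumed to be filled in by the application:

#include <starpu.h>

/* Register an application-owned BCSR matrix (2x2 blocks of floats, first
 * entry at index 0) on the main memory node. */
static starpu_data_handle_t register_bcsr(uint32_t nnz, uint32_t nrow,
					  float *nzval, uint32_t *colind,
					  uint32_t *rowptr)
{
	starpu_data_handle_t handle;
	starpu_bcsr_data_register(&handle, 0, nnz, nrow, (uintptr_t) nzval,
				  colind, rowptr, 0, 2, 2, sizeof(float));
	return handle;
}

The accessors renamed in this file (starpu_bcsr_get_nnz() and friends) then give back the registered geometry.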

+ 4 - 3
src/datawizard/interfaces/block_filters.c

@@ -21,8 +21,8 @@
 void starpu_block_filter_func_block(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f,
                                     unsigned id, unsigned nparts)
 {
-        starpu_block_interface_t *block_father = (starpu_block_interface_t *) father_interface;
-        starpu_block_interface_t *block_child = (starpu_block_interface_t *) child_interface;
+        struct starpu_block_interface *block_father = (struct starpu_block_interface *) father_interface;
+        struct starpu_block_interface *block_child = (struct starpu_block_interface *) child_interface;
 
 	uint32_t nx = block_father->nx;
         uint32_t ny = block_father->ny;
@@ -41,7 +41,8 @@ void starpu_block_filter_func_block(void *father_interface, void *child_interfac
 	block_child->nz = nz;
 	block_child->elemsize = elemsize;
 
-	if (block_father->ptr) {
+	if (block_father->ptr)
+	{
                 block_child->ptr = block_father->ptr + offset;
                 block_child->ldy = block_father->ldy;
                 block_child->ldz = block_father->ldz;
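
A block handle registered with starpu_block_data_register() can be split with this filter directly from the application; a minimal sketch (nslices is arbitrary):

#include <starpu.h>

/* Split a registered block handle into nslices sub-blocks with the filter above. */
static void split_block(starpu_data_handle_t block_handle, unsigned nslices)
{
	struct starpu_data_filter f =
	{
		.filter_func = starpu_block_filter_func_block,
		.nchildren = nslices,
	};

	starpu_data_partition(block_handle, &f);
	/* children are then reached with starpu_data_get_sub_data(block_handle, 1, i) */
}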

+ 146 - 112
src/datawizard/interfaces/block_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,7 +21,7 @@
 #include <datawizard/copy_driver.h>
 #include <datawizard/filters.h>
 
-#include <common/hash.h>
+#include <starpu_hash.h>
 
 #include <starpu_cuda.h>
 #include <starpu_opencl.h>
@@ -42,7 +42,8 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event);
 #endif
 
-static const struct starpu_data_copy_methods block_copy_data_methods_s = {
+static const struct starpu_data_copy_methods block_copy_data_methods_s =
+{
 	.ram_to_ram = copy_ram_to_ram,
 	.ram_to_spu = NULL,
 #ifdef STARPU_USE_CUDA
@@ -65,19 +66,20 @@ static const struct starpu_data_copy_methods block_copy_data_methods_s = {
 };
 
 
-static void register_block_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
-static void *block_handle_to_pointer(starpu_data_handle data_handle, uint32_t node);
+static void register_block_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface);
+static void *block_handle_to_pointer(starpu_data_handle_t data_handle, uint32_t node);
 static ssize_t allocate_block_buffer_on_node(void *data_interface_, uint32_t dst_node);
 static void free_block_buffer_on_node(void *data_interface, uint32_t node);
-static size_t block_interface_get_size(starpu_data_handle handle);
-static uint32_t footprint_block_interface_crc32(starpu_data_handle handle);
+static size_t block_interface_get_size(starpu_data_handle_t handle);
+static uint32_t footprint_block_interface_crc32(starpu_data_handle_t handle);
 static int block_compare(void *data_interface_a, void *data_interface_b);
-static void display_block_interface(starpu_data_handle handle, FILE *f);
+static void display_block_interface(starpu_data_handle_t handle, FILE *f);
 #ifdef STARPU_USE_GORDON
 static int convert_block_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss);
 #endif
 
-static struct starpu_data_interface_ops_t interface_block_ops = {
+static struct starpu_data_interface_ops interface_block_ops =
+{
 	.register_data_handle = register_block_handle,
 	.allocate_data_on_node = allocate_block_buffer_on_node,
 	.handle_to_pointer = block_handle_to_pointer,
@@ -89,13 +91,13 @@ static struct starpu_data_interface_ops_t interface_block_ops = {
 #ifdef STARPU_USE_GORDON
 	.convert_to_gordon = convert_block_to_gordon,
 #endif
-	.interfaceid = STARPU_BLOCK_INTERFACE_ID, 
-	.interface_size = sizeof(starpu_block_interface_t),
-	.display = display_block_interface
+	.interfaceid = STARPU_BLOCK_INTERFACE_ID,
+	.interface_size = sizeof(struct starpu_block_interface),
+	.display = display_block_interface,
 };
 
 #ifdef STARPU_USE_GORDON
-int convert_block_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss) 
+int convert_block_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss)
 {
 	/* TODO */
 	STARPU_ABORT();
@@ -104,34 +106,36 @@ int convert_block_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSi
 }
 #endif
 
-static void *block_handle_to_pointer(starpu_data_handle handle, uint32_t node)
+static void *block_handle_to_pointer(starpu_data_handle_t handle, uint32_t node)
 {
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_block_interface_t *block_interface = (starpu_block_interface_t *)
+	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
 		starpu_data_get_interface_on_node(handle, node);
 
 	return (void*) block_interface->ptr;
 }
 
-static void register_block_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface)
+static void register_block_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface)
 {
-	starpu_block_interface_t *block_interface = (starpu_block_interface_t *) data_interface;
+	struct starpu_block_interface *block_interface = (struct starpu_block_interface *) data_interface;
 
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		starpu_block_interface_t *local_interface = (starpu_block_interface_t *)
+		struct starpu_block_interface *local_interface = (struct starpu_block_interface *)
 			starpu_data_get_interface_on_node(handle, node);
 
-		if (node == home_node) {
+		if (node == home_node)
+		{
 			local_interface->ptr = block_interface->ptr;
                         local_interface->dev_handle = block_interface->dev_handle;
                         local_interface->offset = block_interface->offset;
 			local_interface->ldy  = block_interface->ldy;
 			local_interface->ldz  = block_interface->ldz;
 		}
-		else {
+		else
+		{
 			local_interface->ptr = 0;
                         local_interface->dev_handle = 0;
                         local_interface->offset = 0;
@@ -147,11 +151,12 @@ static void register_block_handle(starpu_data_handle handle, uint32_t home_node,
 }
 
 /* declare a new piece of data with the block interface */
-void starpu_block_data_register(starpu_data_handle *handleptr, uint32_t home_node,
+void starpu_block_data_register(starpu_data_handle_t *handleptr, uint32_t home_node,
 			uintptr_t ptr, uint32_t ldy, uint32_t ldz, uint32_t nx,
 			uint32_t ny, uint32_t nz, size_t elemsize)
 {
-	starpu_block_interface_t block_interface = {
+	struct starpu_block_interface block_interface =
+	{
 		.ptr = ptr,
                 .dev_handle = ptr,
                 .offset = 0,
@@ -166,21 +171,21 @@ void starpu_block_data_register(starpu_data_handle *handleptr, uint32_t home_nod
 	starpu_data_register(handleptr, home_node, &block_interface, &interface_block_ops);
 }
 
-static uint32_t footprint_block_interface_crc32(starpu_data_handle handle)
+static uint32_t footprint_block_interface_crc32(starpu_data_handle_t handle)
 {
 	uint32_t hash;
 
-	hash = _starpu_crc32_be(starpu_block_get_nx(handle), 0);
-	hash = _starpu_crc32_be(starpu_block_get_ny(handle), hash);
-	hash = _starpu_crc32_be(starpu_block_get_nz(handle), hash);
+	hash = starpu_crc32_be(starpu_block_get_nx(handle), 0);
+	hash = starpu_crc32_be(starpu_block_get_ny(handle), hash);
+	hash = starpu_crc32_be(starpu_block_get_nz(handle), hash);
 
 	return hash;
 }
 
 static int block_compare(void *data_interface_a, void *data_interface_b)
 {
-	starpu_block_interface_t *block_a = (starpu_block_interface_t *) data_interface_a;
-	starpu_block_interface_t *block_b = (starpu_block_interface_t *) data_interface_b;
+	struct starpu_block_interface *block_a = (struct starpu_block_interface *) data_interface_a;
+	struct starpu_block_interface *block_b = (struct starpu_block_interface *) data_interface_b;
 
 	/* Two matrices are considered compatible if they have the same size */
 	return ((block_a->nx == block_b->nx)
@@ -189,94 +194,94 @@ static int block_compare(void *data_interface_a, void *data_interface_b)
 			&& (block_a->elemsize == block_b->elemsize));
 }
 
-static void display_block_interface(starpu_data_handle handle, FILE *f)
+static void display_block_interface(starpu_data_handle_t handle, FILE *f)
 {
-	starpu_block_interface_t *block_interface;
+	struct starpu_block_interface *block_interface;
 
-	block_interface = (starpu_block_interface_t *) starpu_data_get_interface_on_node(handle, 0);
+	block_interface = (struct starpu_block_interface *) starpu_data_get_interface_on_node(handle, 0);
 
 	fprintf(f, "%u\t%u\t%u\t", block_interface->nx, block_interface->ny, block_interface->nz);
 }
 
-static size_t block_interface_get_size(starpu_data_handle handle)
+static size_t block_interface_get_size(starpu_data_handle_t handle)
 {
 	size_t size;
-	starpu_block_interface_t *block_interface;
+	struct starpu_block_interface *block_interface;
 
-	block_interface = (starpu_block_interface_t *) starpu_data_get_interface_on_node(handle, 0);
+	block_interface = (struct starpu_block_interface *) starpu_data_get_interface_on_node(handle, 0);
 
-	size = block_interface->nx*block_interface->ny*block_interface->nz*block_interface->elemsize; 
+	size = block_interface->nx*block_interface->ny*block_interface->nz*block_interface->elemsize;
 
 	return size;
 }
 
 /* offer access to the data parameters */
-uint32_t starpu_block_get_nx(starpu_data_handle handle)
+uint32_t starpu_block_get_nx(starpu_data_handle_t handle)
 {
-	starpu_block_interface_t *block_interface = (starpu_block_interface_t *)
+	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return block_interface->nx;
 }
 
-uint32_t starpu_block_get_ny(starpu_data_handle handle)
+uint32_t starpu_block_get_ny(starpu_data_handle_t handle)
 {
-	starpu_block_interface_t *block_interface = (starpu_block_interface_t *)
+	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return block_interface->ny;
 }
 
-uint32_t starpu_block_get_nz(starpu_data_handle handle)
+uint32_t starpu_block_get_nz(starpu_data_handle_t handle)
 {
-	starpu_block_interface_t *block_interface = (starpu_block_interface_t *)
+	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return block_interface->nz;
 }
 
-uint32_t starpu_block_get_local_ldy(starpu_data_handle handle)
+uint32_t starpu_block_get_local_ldy(starpu_data_handle_t handle)
 {
 	unsigned node;
 	node = _starpu_get_local_memory_node();
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
-	
-	starpu_block_interface_t *block_interface = (starpu_block_interface_t *)
+
+	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
 		starpu_data_get_interface_on_node(handle, node);
 
 	return block_interface->ldy;
 }
 
-uint32_t starpu_block_get_local_ldz(starpu_data_handle handle)
+uint32_t starpu_block_get_local_ldz(starpu_data_handle_t handle)
 {
 	unsigned node;
 	node = _starpu_get_local_memory_node();
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_block_interface_t *block_interface = (starpu_block_interface_t *)
+	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
 		starpu_data_get_interface_on_node(handle, node);
 
 	return block_interface->ldz;
 }
 
-uintptr_t starpu_block_get_local_ptr(starpu_data_handle handle)
+uintptr_t starpu_block_get_local_ptr(starpu_data_handle_t handle)
 {
 	unsigned node;
 	node = _starpu_get_local_memory_node();
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_block_interface_t *block_interface = (starpu_block_interface_t *)
+	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
 		starpu_data_get_interface_on_node(handle, node);
 
 	return block_interface->ptr;
 }
 
-size_t starpu_block_get_elemsize(starpu_data_handle handle)
+size_t starpu_block_get_elemsize(starpu_data_handle_t handle)
 {
-	starpu_block_interface_t *block_interface = (starpu_block_interface_t *)
+	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return block_interface->elemsize;
@@ -288,26 +293,27 @@ size_t starpu_block_get_elemsize(starpu_data_handle handle)
 /* returns the size of the allocated area */
 static ssize_t allocate_block_buffer_on_node(void *data_interface_, uint32_t dst_node)
 {
-	uintptr_t addr = 0;
+	uintptr_t addr = 0, handle = 0;
 	unsigned fail = 0;
 	ssize_t allocated_memory;
 
 #ifdef STARPU_USE_CUDA
 	cudaError_t status;
 #endif
-	starpu_block_interface_t *dst_block = (starpu_block_interface_t *) data_interface_;
+	struct starpu_block_interface *dst_block = (struct starpu_block_interface *) data_interface_;
 
 	uint32_t nx = dst_block->nx;
 	uint32_t ny = dst_block->ny;
 	uint32_t nz = dst_block->nz;
 	size_t elemsize = dst_block->elemsize;
 
-	starpu_node_kind kind = _starpu_get_node_kind(dst_node);
+	enum starpu_node_kind kind = starpu_node_get_kind(dst_node);
 
-	switch(kind) {
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
-			addr = (uintptr_t)malloc(nx*ny*nz*elemsize);
-			if (!addr) 
+			handle = addr = (uintptr_t)malloc(nx*ny*nz*elemsize);
+			if (!addr)
 				fail = 1;
 
 			break;
@@ -324,6 +330,7 @@ static ssize_t allocate_block_buffer_on_node(void *data_interface_, uint32_t dst
 
 				fail = 1;
 			}
+			handle = addr;
 
 			break;
 #endif
@@ -331,47 +338,52 @@ static ssize_t allocate_block_buffer_on_node(void *data_interface_, uint32_t dst
 	        case STARPU_OPENCL_RAM:
 			{
                                 int ret;
-                                void *ptr;
-                                ret = _starpu_opencl_allocate_memory(&ptr, nx*ny*nz*elemsize, CL_MEM_READ_WRITE);
-                                addr = (uintptr_t)ptr;
-				if (ret) {
+				cl_mem mem;
+                                ret = starpu_opencl_allocate_memory(&mem, nx*ny*nz*elemsize, CL_MEM_READ_WRITE);
+				handle = (uintptr_t)mem;
+				if (ret)
+				{
 					fail = 1;
 				}
 				break;
 			}
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 
-	if (!fail) {
+	if (!fail)
+	{
 		/* allocation succeeded */
 		allocated_memory = nx*ny*nz*elemsize;
 
 		/* update the data properly in consequence */
 		dst_block->ptr = addr;
-                dst_block->dev_handle = addr;
+		dst_block->dev_handle = handle;
                 dst_block->offset = 0;
 		dst_block->ldy = nx;
 		dst_block->ldz = nx*ny;
-	} else {
+	}
+	else
+	{
 		/* allocation failed */
 		allocated_memory = -ENOMEM;
 	}
-	
+
 	return allocated_memory;
 }
 
 static void free_block_buffer_on_node(void *data_interface, uint32_t node)
 {
-	starpu_block_interface_t *block_interface = (starpu_block_interface_t *) data_interface;
+	struct starpu_block_interface *block_interface = (struct starpu_block_interface *) data_interface;
 
 #ifdef STARPU_USE_CUDA
 	cudaError_t status;
 #endif
 
-	starpu_node_kind kind = _starpu_get_node_kind(node);
-	switch(kind) {
+	enum starpu_node_kind kind = starpu_node_get_kind(node);
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
 			free((void*)block_interface->ptr);
 			break;
@@ -385,19 +397,19 @@ static void free_block_buffer_on_node(void *data_interface, uint32_t node)
 #endif
 #ifdef STARPU_USE_OPENCL
                 case STARPU_OPENCL_RAM:
-                        clReleaseMemObject((void *)block_interface->ptr);
+			clReleaseMemObject((void *)block_interface->dev_handle);
                         break;
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 }
 
 #ifdef STARPU_USE_CUDA
 static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
 {
-	starpu_block_interface_t *src_block = src_interface;
-	starpu_block_interface_t *dst_block = dst_interface;
+	struct starpu_block_interface *src_block = src_interface;
+	struct starpu_block_interface *dst_block = dst_interface;
 
 	uint32_t nx = src_block->nx;
 	uint32_t ny = src_block->ny;
@@ -416,7 +428,8 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
                         if (STARPU_UNLIKELY(cures))
                                 STARPU_CUDA_REPORT_ERROR(cures);
                 }
-		else {
+		else
+		{
 			/* Are all planes contiguous? */
                         cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
                                              (char *)src_block->ptr, src_block->ldz*elemsize,
@@ -425,7 +438,8 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
                                 STARPU_CUDA_REPORT_ERROR(cures);
                 }
 	}
-	else {
+	else
+	{
 		/* Default case: we transfer all lines one by one: ny*nz transfers */
 		unsigned layer;
 		for (layer = 0; layer < src_block->nz; layer++)
@@ -442,15 +456,15 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 		}
 	}
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->elemsize*src_block->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->elemsize*src_block->elemsize);
 
 	return 0;
 }
 
 static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream, enum cudaMemcpyKind kind)
 {
-	starpu_block_interface_t *src_block = src_interface;
-	starpu_block_interface_t *dst_block = dst_interface;
+	struct starpu_block_interface *src_block = src_interface;
+	struct starpu_block_interface *dst_block = dst_interface;
 
 	uint32_t nx = src_block->nx;
 	uint32_t ny = src_block->ny;
@@ -468,8 +482,10 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 		/* Is that a single contiguous buffer ? */
 		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
 		{
+			_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 			cures = cudaMemcpyAsync((char *)dst_block->ptr, (char *)src_block->ptr,
 					nx*ny*nz*elemsize, kind, stream);
+			_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 			if (STARPU_UNLIKELY(cures))
 			{
 				cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
@@ -479,16 +495,20 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 
 				ret = 0;
 			}
-			else {
+			else
+			{
 				ret = -EAGAIN;
 			}
-			
+
 		}
-		else {
+		else
+		{
 			/* Are all planes contiguous? */
+			_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 			cures = cudaMemcpy2DAsync((char *)dst_block->ptr, dst_block->ldz*elemsize,
 					(char *)src_block->ptr, src_block->ldz*elemsize,
 					nx*ny*elemsize, nz, kind, stream);
+			_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 			if (STARPU_UNLIKELY(cures))
 			{
 				cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
@@ -499,12 +519,14 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 
 				ret = 0;
 			}
-			else {
+			else
+			{
 				ret = -EAGAIN;
 			}
 		}
 	}
-	else {
+	else
+	{
 		/* Default case: we transfer all lines one by one: ny*nz transfers */
 		unsigned layer;
 		for (layer = 0; layer < src_block->nz; layer++)
@@ -512,9 +534,11 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
 			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
 
+			_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 			cures = cudaMemcpy2DAsync((char *)dst_ptr, dst_block->ldy*elemsize,
                                                   (char *)src_ptr, src_block->ldy*elemsize,
                                                   nx*elemsize, ny, kind, stream);
+			_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 
 			if (STARPU_UNLIKELY(cures))
 			{
@@ -528,7 +552,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 
 	}
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
 
 	return ret;
 
@@ -549,7 +573,7 @@ no_async_default:
 			STARPU_CUDA_REPORT_ERROR(cures);
 	}
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
 	return 0;
 	}
 }
@@ -583,9 +607,9 @@ static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_
 #ifdef STARPU_USE_OPENCL
 static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
 {
-	starpu_block_interface_t *src_block = src_interface;
-	starpu_block_interface_t *dst_block = dst_interface;
-        int err,ret;
+	struct starpu_block_interface *src_block = src_interface;
+	struct starpu_block_interface *dst_block = dst_interface;
+        int err, ret = 0;
 
 	uint32_t nx = src_block->nx;
 	uint32_t ny = src_block->ny;
@@ -597,27 +621,30 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 		/* Is that a single contiguous buffer ? */
 		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
 		{
-                        err = _starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_block->ptr, (cl_mem)dst_block->dev_handle,
+                        err = starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_block->ptr, src_node, (cl_mem)dst_block->dev_handle, dst_node,
                                                                            src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
                                                                            dst_block->offset, (cl_event*)_event, &ret);
                         if (STARPU_UNLIKELY(err))
                                 STARPU_OPENCL_REPORT_ERROR(err);
                 }
-		else {
+		else
+		{
 			/* Are all planes contiguous? */
                         /* XXX non contiguous buffers are not properly supported yet. (TODO) */
                         STARPU_ASSERT(0);
                 }
         }
-	else {
+	else
+	{
 		/* Default case: we transfer all lines one by one: ny*nz transfers */
 		unsigned layer;
 		for (layer = 0; layer < src_block->nz; layer++)
 		{
                         unsigned j;
-                        for(j=0 ; j<src_block->ny ; j++) {
+                        for(j=0 ; j<src_block->ny ; j++)
+			{
                                 void *ptr = (void*)src_block->ptr+(layer*src_block->ldz*src_block->elemsize)+(j*src_block->ldy*src_block->elemsize);
-                                err = _starpu_opencl_copy_ram_to_opencl(ptr, (cl_mem)dst_block->dev_handle,
+                                err = starpu_opencl_copy_ram_to_opencl(ptr, src_node, (cl_mem)dst_block->dev_handle, dst_node,
                                                                         src_block->nx*src_block->elemsize,
                                                                         layer*dst_block->ldz*dst_block->elemsize + j*dst_block->ldy*dst_block->elemsize
                                                                         + dst_block->offset, NULL);
@@ -636,23 +663,23 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
                         //                        size_t host_row_pitch=region[0];
                         //                        size_t host_slice_pitch=region[1] * host_row_pitch;
                         //
-                        //                        _starpu_opencl_copy_rect_ram_to_opencl((void *)src_block->ptr, (cl_mem)dst_block->dev_handle,
+                        //                        _starpu_opencl_copy_rect_ram_to_opencl((void *)src_block->ptr, src_node, (cl_mem)dst_block->dev_handle, dst_node,
                         //                                                               buffer_origin, host_origin, region,
                         //                                                               buffer_row_pitch, buffer_slice_pitch,
                         //                                                               host_row_pitch, host_slice_pitch, NULL);
                 }
         }
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
 
 	return ret;
 }
 
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
 {
-	starpu_block_interface_t *src_block = src_interface;
-	starpu_block_interface_t *dst_block = dst_interface;
-        int err, ret;
+	struct starpu_block_interface *src_block = src_interface;
+	struct starpu_block_interface *dst_block = dst_interface;
+        int err, ret = 0;
 
 	/* We may have a contiguous buffer for the entire block, or contiguous
 	 * planes within the block, so we can avoid many small transfers that way */
@@ -661,31 +688,36 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
 		/* Is that a single contiguous buffer ? */
 		if (((src_block->nx*src_block->ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
 		{
-                        err = _starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_block->dev_handle, (void*)dst_block->ptr,
+                        err = starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_block->dev_handle, src_node, (void*)dst_block->ptr, dst_node,
                                                                            src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
                                                                            src_block->offset, (cl_event*)_event, &ret);
                         if (STARPU_UNLIKELY(err))
                                 STARPU_OPENCL_REPORT_ERROR(err);
                 }
-                else {
+                else
+		{
 			/* Are all planes contiguous? */
                         /* XXX non contiguous buffers are not properly supported yet. (TODO) */
                         STARPU_ASSERT(0);
                 }
         }
-	else {
+	else
+	{
 		/* Default case: we transfer all lines one by one: ny*nz transfers */
                 /* XXX non contiguous buffers are not properly supported yet. (TODO) */
 		unsigned layer;
 		for (layer = 0; layer < src_block->nz; layer++)
 		{
                         unsigned j;
-                        for(j=0 ; j<src_block->ny ; j++) {
+                        for(j=0 ; j<src_block->ny ; j++)
+			{
                                 void *ptr = (void *)dst_block->ptr+(layer*dst_block->ldz*dst_block->elemsize)+(j*dst_block->ldy*dst_block->elemsize);
-                                err = _starpu_opencl_copy_opencl_to_ram((void*)src_block->dev_handle, ptr,
+                                err = starpu_opencl_copy_opencl_to_ram((void*)src_block->dev_handle, src_node, ptr, dst_node,
                                                                         src_block->nx*src_block->elemsize,
                                                                         layer*src_block->ldz*src_block->elemsize+j*src_block->ldy*src_block->elemsize+
                                                                         src_block->offset, NULL);
+				if (STARPU_UNLIKELY(err))
+					STARPU_OPENCL_REPORT_ERROR(err);
                         }
                         //                        const size_t buffer_origin[3] = {src_block->offset, 0, 0};
                         //                        const size_t host_origin[3] = {layer*src_block->ldz*src_block->elemsize, 0, 0};
@@ -695,14 +727,14 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
                         //                        size_t host_row_pitch=region[0];
                         //                        size_t host_slice_pitch=region[1] * host_row_pitch;
                         //
-                        //                        _starpu_opencl_copy_rect_opencl_to_ram((cl_mem)src_block->dev_handle, (void *)dst_block->ptr,
+                        //                        _starpu_opencl_copy_rect_opencl_to_ram((cl_mem)src_block->dev_handle, src_node, (void *)dst_block->ptr, dst_node,
                         //                                                               buffer_origin, host_origin, region,
                         //                                                               buffer_row_pitch, buffer_slice_pitch,
                         //                                                               host_row_pitch, host_slice_pitch, NULL);
                 }
         }
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
 
 	return ret;
 }
@@ -722,8 +754,8 @@ static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTR
 /* as not all platforms easily have a BLAS lib installed ... */
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
-	starpu_block_interface_t *src_block = (starpu_block_interface_t *) src_interface;
-	starpu_block_interface_t *dst_block = (starpu_block_interface_t *) dst_interface;
+	struct starpu_block_interface *src_block = (struct starpu_block_interface *) src_interface;
+	struct starpu_block_interface *dst_block = (struct starpu_block_interface *) dst_interface;
 
 	uint32_t nx = dst_block->nx;
 	uint32_t ny = dst_block->ny;
@@ -740,16 +772,18 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBU
 
 	unsigned y, z;
 	for (z = 0; z < nz; z++)
-	for (y = 0; y < ny; y++)
 	{
-		uint32_t src_offset = (y*ldy_src + y*z*ldz_src)*elemsize;
-		uint32_t dst_offset = (y*ldy_dst + y*z*ldz_dst)*elemsize;
+		for (y = 0; y < ny; y++)
+		{
+			uint32_t src_offset = (y*ldy_src + z*ldz_src)*elemsize;
+			uint32_t dst_offset = (y*ldy_dst + z*ldz_dst)*elemsize;
 
-		memcpy((void *)(ptr_dst + dst_offset), 
-			(void *)(ptr_src + src_offset), nx*elemsize);
+			memcpy((void *)(ptr_dst + dst_offset),
+				(void *)(ptr_src + src_offset), nx*elemsize);
+		}
 	}
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, nx*ny*nz*elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nx*ny*nz*elemsize);
 
 	return 0;
 }
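
For reference, registering a block with the renamed type and reading its geometry back through the accessors updated above; the layout here is assumed fully contiguous (ldy = nx, ldz = nx*ny):

#include <starpu.h>
#include <stdlib.h>

/* Register a contiguous nx x ny x nz block of floats on the main memory node. */
static starpu_data_handle_t register_block(uint32_t nx, uint32_t ny, uint32_t nz)
{
	float *block = calloc((size_t) nx * ny * nz, sizeof(float));
	starpu_data_handle_t handle;

	starpu_block_data_register(&handle, 0, (uintptr_t) block,
				   nx /* ldy */, nx * ny /* ldz */,
				   nx, ny, nz, sizeof(float));

	/* the accessors renamed above return the registered geometry */
	STARPU_ASSERT(starpu_block_get_nx(handle) == nx);

	return handle;
}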

+ 13 - 12
src/datawizard/interfaces/csr_filters.c

@@ -22,8 +22,8 @@
 
 void starpu_vertical_block_filter_func_csr(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, unsigned nchunks)
 {
-	starpu_csr_interface_t *csr_father = (starpu_csr_interface_t *) father_interface;
-	starpu_csr_interface_t *csr_child = (starpu_csr_interface_t *) child_interface;
+	struct starpu_csr_interface *csr_father = (struct starpu_csr_interface *) father_interface;
+	struct starpu_csr_interface *csr_child = (struct starpu_csr_interface *) child_interface;
 
 	uint32_t nrow = csr_father->nrow;
 	size_t elemsize = csr_father->elemsize;
@@ -35,20 +35,21 @@ void starpu_vertical_block_filter_func_csr(void *father_interface, void *child_i
 
 	uint32_t first_index = id*chunk_size - firstentry;
 	uint32_t local_firstentry = rowptr[first_index];
-	
-	uint32_t child_nrow = 
+
+	uint32_t child_nrow =
 	  STARPU_MIN(chunk_size, nrow - id*chunk_size);
-	
-	uint32_t local_nnz = rowptr[first_index + child_nrow] - rowptr[first_index]; 
-	
+
+	uint32_t local_nnz = rowptr[first_index + child_nrow] - rowptr[first_index];
+
 	csr_child->nnz = local_nnz;
 	csr_child->nrow = child_nrow;
 	csr_child->firstentry = local_firstentry;
 	csr_child->elemsize = elemsize;
-	
-	if (csr_father->nzval) {
-	  csr_child->rowptr = &csr_father->rowptr[first_index];
-	  csr_child->colind = &csr_father->colind[local_firstentry];
-	  csr_child->nzval = csr_father->nzval + local_firstentry * elemsize;
+
+	if (csr_father->nzval)
+	{
+		csr_child->rowptr = &csr_father->rowptr[first_index];
+		csr_child->colind = &csr_father->colind[local_firstentry];
+		csr_child->nzval = csr_father->nzval + local_firstentry * elemsize;
 	}
 }
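
Unlike the BCSR canonical-block filter, the children produced here are CSR matrices themselves, so (per the get_child_ops fallback shown earlier) no get_child_ops callback is needed. A minimal sketch, assuming csr_handle was registered with starpu_csr_data_register():

#include <starpu.h>

/* Split a CSR matrix row-wise into `parts` CSR sub-matrices. */
static void split_csr(starpu_data_handle_t csr_handle, unsigned parts)
{
	struct starpu_data_filter f =
	{
		.filter_func = starpu_vertical_block_filter_func_csr,
		.nchildren = parts,
	};

	starpu_data_partition(csr_handle, &f);
	/* each starpu_data_get_sub_data(csr_handle, 1, i) is again a CSR handle */
}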

+ 136 - 96
src/datawizard/interfaces/csr_interface.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -22,7 +22,7 @@
 #include <datawizard/copy_driver.h>
 #include <datawizard/filters.h>
 
-#include <common/hash.h>
+#include <starpu_hash.h>
 
 #include <starpu_cuda.h>
 #include <starpu_opencl.h>
@@ -42,7 +42,8 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_
 static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 #endif
 
-static const struct starpu_data_copy_methods csr_copy_data_methods_s = {
+static const struct starpu_data_copy_methods csr_copy_data_methods_s =
+{
 	.ram_to_ram = copy_ram_to_ram,
 	.ram_to_spu = NULL,
 #ifdef STARPU_USE_CUDA
@@ -63,40 +64,43 @@ static const struct starpu_data_copy_methods csr_copy_data_methods_s = {
 	.spu_to_spu = NULL
 };
 
-static void register_csr_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
+static void register_csr_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface);
 static ssize_t allocate_csr_buffer_on_node(void *data_interface_, uint32_t dst_node);
 static void free_csr_buffer_on_node(void *data_interface, uint32_t node);
-static size_t csr_interface_get_size(starpu_data_handle handle);
+static size_t csr_interface_get_size(starpu_data_handle_t handle);
 static int csr_compare(void *data_interface_a, void *data_interface_b);
-static uint32_t footprint_csr_interface_crc32(starpu_data_handle handle);
+static uint32_t footprint_csr_interface_crc32(starpu_data_handle_t handle);
 
-static struct starpu_data_interface_ops_t interface_csr_ops = {
+static struct starpu_data_interface_ops interface_csr_ops =
+{
 	.register_data_handle = register_csr_handle,
 	.allocate_data_on_node = allocate_csr_buffer_on_node,
 	.free_data_on_node = free_csr_buffer_on_node,
 	.copy_methods = &csr_copy_data_methods_s,
 	.get_size = csr_interface_get_size,
 	.interfaceid = STARPU_CSR_INTERFACE_ID,
-	.interface_size = sizeof(starpu_csr_interface_t),
+	.interface_size = sizeof(struct starpu_csr_interface),
 	.footprint = footprint_csr_interface_crc32,
-	.compare = csr_compare
+	.compare = csr_compare,
 };
 
-static void register_csr_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface)
+static void register_csr_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface)
 {
-	starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *) data_interface;
+	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *) data_interface;
 
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		starpu_csr_interface_t *local_interface = (starpu_csr_interface_t *)
+		struct starpu_csr_interface *local_interface = (struct starpu_csr_interface *)
 			starpu_data_get_interface_on_node(handle, node);
 
-		if (node == home_node) {
+		if (node == home_node)
+		{
 			local_interface->nzval = csr_interface->nzval;
 			local_interface->colind = csr_interface->colind;
 		}
-		else {
+		else
+		{
 			local_interface->nzval = 0;
 			local_interface->colind = NULL;
 		}
@@ -111,10 +115,11 @@ static void register_csr_handle(starpu_data_handle handle, uint32_t home_node, v
 }
 
 /* declare a new data with the BLAS interface */
-void starpu_csr_data_register(starpu_data_handle *handleptr, uint32_t home_node,
+void starpu_csr_data_register(starpu_data_handle_t *handleptr, uint32_t home_node,
 		uint32_t nnz, uint32_t nrow, uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, size_t elemsize)
 {
-	starpu_csr_interface_t csr_interface = {
+	struct starpu_csr_interface csr_interface =
+	{
 		.nnz = nnz,
 		.nrow = nrow,
 		.nzval = nzval,
@@ -127,15 +132,15 @@ void starpu_csr_data_register(starpu_data_handle *handleptr, uint32_t home_node,
 	starpu_data_register(handleptr, home_node, &csr_interface, &interface_csr_ops);
 }
 
-static uint32_t footprint_csr_interface_crc32(starpu_data_handle handle)
+static uint32_t footprint_csr_interface_crc32(starpu_data_handle_t handle)
 {
-	return _starpu_crc32_be(starpu_csr_get_nnz(handle), 0);
+	return starpu_crc32_be(starpu_csr_get_nnz(handle), 0);
 }
 
 static int csr_compare(void *data_interface_a, void *data_interface_b)
 {
-	starpu_csr_interface_t *csr_a = (starpu_csr_interface_t *) data_interface_a;
-	starpu_csr_interface_t *csr_b = (starpu_csr_interface_t *) data_interface_b;
+	struct starpu_csr_interface *csr_a = (struct starpu_csr_interface *) data_interface_a;
+	struct starpu_csr_interface *csr_b = (struct starpu_csr_interface *) data_interface_b;
 
 	/* Two matricess are considered compatible if they have the same size */
 	return ((csr_a->nnz == csr_b->nnz)
@@ -144,78 +149,78 @@ static int csr_compare(void *data_interface_a, void *data_interface_b)
 }
 
 /* offer an access to the data parameters */
-uint32_t starpu_csr_get_nnz(starpu_data_handle handle)
+uint32_t starpu_csr_get_nnz(starpu_data_handle_t handle)
 {
-	starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *)
+	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return csr_interface->nnz;
 }
 
-uint32_t starpu_csr_get_nrow(starpu_data_handle handle)
+uint32_t starpu_csr_get_nrow(starpu_data_handle_t handle)
 {
-	starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *)
+	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return csr_interface->nrow;
 }
 
-uint32_t starpu_csr_get_firstentry(starpu_data_handle handle)
+uint32_t starpu_csr_get_firstentry(starpu_data_handle_t handle)
 {
-	starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *)
+	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return csr_interface->firstentry;
 }
 
-size_t starpu_csr_get_elemsize(starpu_data_handle handle)
+size_t starpu_csr_get_elemsize(starpu_data_handle_t handle)
 {
-	starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *)
+	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return csr_interface->elemsize;
 }
 
-uintptr_t starpu_csr_get_local_nzval(starpu_data_handle handle)
+uintptr_t starpu_csr_get_local_nzval(starpu_data_handle_t handle)
 {
 	unsigned node;
 	node = _starpu_get_local_memory_node();
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *)
+	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *)
 		starpu_data_get_interface_on_node(handle, node);
 
 	return csr_interface->nzval;
 }
 
-uint32_t *starpu_csr_get_local_colind(starpu_data_handle handle)
+uint32_t *starpu_csr_get_local_colind(starpu_data_handle_t handle)
 {
 	unsigned node;
 	node = _starpu_get_local_memory_node();
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *)
+	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *)
 		starpu_data_get_interface_on_node(handle, node);
 
 	return csr_interface->colind;
 }
 
-uint32_t *starpu_csr_get_local_rowptr(starpu_data_handle handle)
+uint32_t *starpu_csr_get_local_rowptr(starpu_data_handle_t handle)
 {
 	unsigned node;
 	node = _starpu_get_local_memory_node();
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *)
+	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *)
 		starpu_data_get_interface_on_node(handle, node);
 
 	return csr_interface->rowptr;
 }
 
-static size_t csr_interface_get_size(starpu_data_handle handle)
+static size_t csr_interface_get_size(starpu_data_handle_t handle)
 {
 	size_t size;
 
@@ -238,15 +243,16 @@ static ssize_t allocate_csr_buffer_on_node(void *data_interface_, uint32_t dst_n
 	ssize_t allocated_memory;
 
 	/* we need the 3 arrays to be allocated */
-	starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *) data_interface_;
+	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *) data_interface_;
 
 	uint32_t nnz = csr_interface->nnz;
 	uint32_t nrow = csr_interface->nrow;
 	size_t elemsize = csr_interface->elemsize;
 
-	starpu_node_kind kind = _starpu_get_node_kind(dst_node);
+	enum starpu_node_kind kind = starpu_node_get_kind(dst_node);
 
-	switch(kind) {
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
 			addr_nzval = (uintptr_t)malloc(nnz*elemsize);
 			if (!addr_nzval)
@@ -281,46 +287,52 @@ static ssize_t allocate_csr_buffer_on_node(void *data_interface_, uint32_t dst_n
 	        case STARPU_OPENCL_RAM:
 			{
                                 int ret;
-                                void *ptr;
+				cl_mem ptr;
 
-                                ret = _starpu_opencl_allocate_memory(&ptr, nnz*elemsize, CL_MEM_READ_WRITE);
+                                ret = starpu_opencl_allocate_memory(&ptr, nnz*elemsize, CL_MEM_READ_WRITE);
                                 addr_nzval = (uintptr_t)ptr;
 				if (ret) goto fail_nzval;
 
-                                ret = _starpu_opencl_allocate_memory(&ptr, nnz*sizeof(uint32_t), CL_MEM_READ_WRITE);
-                                addr_colind = ptr;
+                                ret = starpu_opencl_allocate_memory(&ptr, nnz*sizeof(uint32_t), CL_MEM_READ_WRITE);
+                                addr_colind = (void*) ptr;
 				if (ret) goto fail_colind;
 
-                                ret = _starpu_opencl_allocate_memory(&ptr, (nrow+1)*sizeof(uint32_t), CL_MEM_READ_WRITE);
-                                addr_rowptr = ptr;
+                                ret = starpu_opencl_allocate_memory(&ptr, (nrow+1)*sizeof(uint32_t), CL_MEM_READ_WRITE);
+                                addr_rowptr = (void*) ptr;
 				if (ret) goto fail_rowptr;
 
 				break;
 			}
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 
 	/* allocation succeeded */
-	allocated_memory = 
+	allocated_memory =
 		nnz*elemsize + nnz*sizeof(uint32_t) + (nrow+1)*sizeof(uint32_t);
 
 	/* update the data properly in consequence */
 	csr_interface->nzval = addr_nzval;
 	csr_interface->colind = addr_colind;
 	csr_interface->rowptr = addr_rowptr;
-	
+
 	return allocated_memory;
 
 fail_rowptr:
-	switch(kind) {
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
 			free((void *)addr_colind);
 #ifdef STARPU_USE_CUDA
 		case STARPU_CUDA_RAM:
-			cudaFree((void*)addr_colind);
+		{
+			cudaError_t err;
+			err = cudaFree((void*)addr_colind);
+			if (STARPU_UNLIKELY(err != cudaSuccess))
+				STARPU_CUDA_REPORT_ERROR(err);
 			break;
+		}
 #endif
 #ifdef STARPU_USE_OPENCL
 		case STARPU_OPENCL_RAM:
@@ -328,17 +340,23 @@ fail_rowptr:
 			break;
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 
 fail_colind:
-	switch(kind) {
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
 			free((void *)addr_nzval);
 #ifdef STARPU_USE_CUDA
 		case STARPU_CUDA_RAM:
-			cudaFree((void*)addr_nzval);
+		{
+			cudaError_t err;
+			err = cudaFree((void*)addr_nzval);
+			if (STARPU_UNLIKELY(err != cudaSuccess))
+				STARPU_CUDA_REPORT_ERROR(err);
 			break;
+		}
 #endif
 #ifdef STARPU_USE_OPENCL
 		case STARPU_OPENCL_RAM:
@@ -346,7 +364,7 @@ fail_colind:
 			break;
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 
 fail_nzval:
@@ -357,10 +375,11 @@ fail_nzval:
 
 static void free_csr_buffer_on_node(void *data_interface, uint32_t node)
 {
-	starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *) data_interface;
+	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *) data_interface;
 
-	starpu_node_kind kind = _starpu_get_node_kind(node);
-	switch(kind) {
+	enum starpu_node_kind kind = starpu_node_get_kind(node);
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
 			free((void*)csr_interface->nzval);
 			free((void*)csr_interface->colind);
@@ -368,10 +387,19 @@ static void free_csr_buffer_on_node(void *data_interface, uint32_t node)
 			break;
 #ifdef STARPU_USE_CUDA
 		case STARPU_CUDA_RAM:
-			cudaFree((void*)csr_interface->nzval);
-			cudaFree((void*)csr_interface->colind);
-			cudaFree((void*)csr_interface->rowptr);
+		{
+			cudaError_t err;
+			err = cudaFree((void*)csr_interface->nzval);
+			if (STARPU_UNLIKELY(err != cudaSuccess))
+				STARPU_CUDA_REPORT_ERROR(err);
+			err = cudaFree((void*)csr_interface->colind);
+			if (STARPU_UNLIKELY(err != cudaSuccess))
+				STARPU_CUDA_REPORT_ERROR(err);
+			err = cudaFree((void*)csr_interface->rowptr);
+			if (STARPU_UNLIKELY(err != cudaSuccess))
+				STARPU_CUDA_REPORT_ERROR(err);
 			break;
+		}
 #endif
 #ifdef STARPU_USE_OPENCL
 		case STARPU_OPENCL_RAM:
@@ -381,15 +409,15 @@ static void free_csr_buffer_on_node(void *data_interface, uint32_t node)
 			break;
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 }
 
 #ifdef STARPU_USE_CUDA
 static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
 {
-	starpu_csr_interface_t *src_csr = src_interface;
-	starpu_csr_interface_t *dst_csr = dst_interface;
+	struct starpu_csr_interface *src_csr = src_interface;
+	struct starpu_csr_interface *dst_csr = dst_interface;
 
 	uint32_t nnz = src_csr->nnz;
 	uint32_t nrow = src_csr->nrow;
@@ -409,15 +437,15 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 
 	return 0;
 }
 
 static int copy_cuda_common_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind, cudaStream_t stream)
 {
-	starpu_csr_interface_t *src_csr = src_interface;
-	starpu_csr_interface_t *dst_csr = dst_interface;
+	struct starpu_csr_interface *src_csr = src_interface;
+	struct starpu_csr_interface *dst_csr = dst_interface;
 
 	uint32_t nnz = src_csr->nnz;
 	uint32_t nrow = src_csr->nrow;
@@ -427,10 +455,12 @@ static int copy_cuda_common_async(void *src_interface, unsigned src_node STARPU_
 
 	int synchronous_fallback = 0;
 
+	_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 	cures = cudaMemcpyAsync((char *)dst_csr->nzval, (char *)src_csr->nzval, nnz*elemsize, kind, stream);
 	if (cures)
 	{
 		synchronous_fallback = 1;
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpy((char *)dst_csr->nzval, (char *)src_csr->nzval, nnz*elemsize, kind);
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
@@ -444,6 +474,7 @@ static int copy_cuda_common_async(void *src_interface, unsigned src_node STARPU_
 	if (synchronous_fallback || cures != cudaSuccess)
 	{
 		synchronous_fallback = 1;
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpy((char *)dst_csr->colind, (char *)src_csr->colind, nnz*sizeof(uint32_t), kind);
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
@@ -457,17 +488,20 @@ static int copy_cuda_common_async(void *src_interface, unsigned src_node STARPU_
 	if (synchronous_fallback || cures != cudaSuccess)
 	{
 		synchronous_fallback = 1;
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpy((char *)dst_csr->rowptr, (char *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t), kind);
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
 	}
-	
+
 	if (synchronous_fallback)
 	{
-		STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+		_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 		return 0;
 	}
-	else {
+	else
+	{
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		return -EAGAIN;
 	}
 }
@@ -475,15 +509,15 @@ static int copy_cuda_common_async(void *src_interface, unsigned src_node STARPU_
 static int copy_cuda_peer(void *src_interface STARPU_ATTRIBUTE_UNUSED, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface STARPU_ATTRIBUTE_UNUSED, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
 #ifdef HAVE_CUDA_MEMCPY_PEER
-	starpu_csr_interface_t *src_csr = src_interface;
-	starpu_csr_interface_t *dst_csr = dst_interface;
+	struct starpu_csr_interface *src_csr = src_interface;
+	struct starpu_csr_interface *dst_csr = dst_interface;
 
 	uint32_t nnz = src_csr->nnz;
 	uint32_t nrow = src_csr->nrow;
 	size_t elemsize = src_csr->elemsize;
 
-	int src_dev = starpu_memory_node_to_devid(src_node);
-	int dst_dev = starpu_memory_node_to_devid(dst_node);
+	int src_dev = _starpu_memory_node_to_devid(src_node);
+	int dst_dev = _starpu_memory_node_to_devid(dst_node);
 
 	cudaError_t cures;
 
@@ -499,7 +533,7 @@ static int copy_cuda_peer(void *src_interface STARPU_ATTRIBUTE_UNUSED, unsigned
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 
 	return 0;
 #else
@@ -512,8 +546,8 @@ static int copy_cuda_peer_async(void *src_interface STARPU_ATTRIBUTE_UNUSED, uns
 				void *dst_interface STARPU_ATTRIBUTE_UNUSED, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream STARPU_ATTRIBUTE_UNUSED)
 {
 #ifdef HAVE_CUDA_MEMCPY_PEER
-	starpu_csr_interface_t *src_csr = src_interface;
-	starpu_csr_interface_t *dst_csr = dst_interface;
+	struct starpu_csr_interface *src_csr = src_interface;
+	struct starpu_csr_interface *dst_csr = dst_interface;
 
 	uint32_t nnz = src_csr->nnz;
 	uint32_t nrow = src_csr->nrow;
@@ -521,15 +555,17 @@ static int copy_cuda_peer_async(void *src_interface STARPU_ATTRIBUTE_UNUSED, uns
 
 	cudaError_t cures;
 
-	int src_dev = starpu_memory_node_to_devid(src_node);
-	int dst_dev = starpu_memory_node_to_devid(dst_node);
+	int src_dev = _starpu_memory_node_to_devid(src_node);
+	int dst_dev = _starpu_memory_node_to_devid(dst_node);
 
 	int synchronous_fallback = 0;
 
+	_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 	cures = cudaMemcpyPeerAsync((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize, stream);
 	if (cures)
 	{
 		synchronous_fallback = 1;
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpyPeer((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize);
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
@@ -543,6 +579,7 @@ static int copy_cuda_peer_async(void *src_interface STARPU_ATTRIBUTE_UNUSED, uns
 	if (synchronous_fallback || cures != cudaSuccess)
 	{
 		synchronous_fallback = 1;
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpyPeer((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t));
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
@@ -556,17 +593,20 @@ static int copy_cuda_peer_async(void *src_interface STARPU_ATTRIBUTE_UNUSED, uns
 	if (synchronous_fallback || cures != cudaSuccess)
 	{
 		synchronous_fallback = 1;
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpyPeer((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t));
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
 	}
-	
+
 	if (synchronous_fallback)
 	{
-		STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+		_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 		return 0;
 	}
-	else {
+	else
+	{
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		return -EAGAIN;
 	}
 #else
@@ -617,8 +657,8 @@ static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void
 #ifdef STARPU_USE_OPENCL
 static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
-	starpu_csr_interface_t *src_csr = src_interface;
-	starpu_csr_interface_t *dst_csr = dst_interface;
+	struct starpu_csr_interface *src_csr = src_interface;
+	struct starpu_csr_interface *dst_csr = dst_interface;
 
 	uint32_t nnz = src_csr->nnz;
 	uint32_t nrow = src_csr->nrow;
@@ -626,27 +666,27 @@ static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTR
 
         int err;
 
-        err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->nzval, (void *)dst_csr->nzval, nnz*elemsize, 0, NULL);
+        err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->nzval, src_node, (void *)dst_csr->nzval, dst_node, nnz*elemsize, 0, NULL);
 	if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->colind, (void *)dst_csr->colind, nnz*sizeof(uint32_t), 0, NULL);
+	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->colind, src_node, (void *)dst_csr->colind, dst_node, nnz*sizeof(uint32_t), 0, NULL);
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-        err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->rowptr, (void *)dst_csr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
+        err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->rowptr, src_node, (void *)dst_csr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), 0, NULL);
 	if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 
 	return 0;
 }
 
 static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
-	starpu_csr_interface_t *src_csr = src_interface;
-	starpu_csr_interface_t *dst_csr = dst_interface;
+	struct starpu_csr_interface *src_csr = src_interface;
+	struct starpu_csr_interface *dst_csr = dst_interface;
 
 	uint32_t nnz = src_csr->nnz;
 	uint32_t nrow = src_csr->nrow;
@@ -654,19 +694,19 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTR
 
         int err;
 
-        err = _starpu_opencl_copy_ram_to_opencl((void *)src_csr->nzval, (cl_mem)dst_csr->nzval, nnz*elemsize, 0, NULL);
+        err = starpu_opencl_copy_ram_to_opencl((void *)src_csr->nzval, src_node, (cl_mem)dst_csr->nzval, dst_node, nnz*elemsize, 0, NULL);
 	if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = _starpu_opencl_copy_ram_to_opencl((void *)src_csr->colind, (cl_mem)dst_csr->colind, nnz*sizeof(uint32_t), 0, NULL);
+	err = starpu_opencl_copy_ram_to_opencl((void *)src_csr->colind, src_node, (cl_mem)dst_csr->colind, dst_node, nnz*sizeof(uint32_t), 0, NULL);
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-        err = _starpu_opencl_copy_ram_to_opencl((void *)src_csr->rowptr, (cl_mem)dst_csr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
+        err = starpu_opencl_copy_ram_to_opencl((void *)src_csr->rowptr, src_node, (cl_mem)dst_csr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), 0, NULL);
 	if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 
 	return 0;
 }
@@ -675,8 +715,8 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTR
 /* as not all platform easily have a BLAS lib installed ... */
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
-	starpu_csr_interface_t *src_csr = (starpu_csr_interface_t *) src_interface;
-	starpu_csr_interface_t *dst_csr = (starpu_csr_interface_t *) dst_interface;
+	struct starpu_csr_interface *src_csr = (struct starpu_csr_interface *) src_interface;
+	struct starpu_csr_interface *dst_csr = (struct starpu_csr_interface *) dst_interface;
 
 	uint32_t nnz = src_csr->nnz;
 	uint32_t nrow = src_csr->nrow;
@@ -688,7 +728,7 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBU
 
 	memcpy((void *)dst_csr->rowptr, (void *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t));
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 
 	return 0;
 }
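
Registration sketch matching the starpu_csr_data_register() prototype shown in this file's diff (the 3x3 identity matrix used as input is illustrative only):

#include <starpu.h>
#include <stdint.h>

static starpu_data_handle_t register_identity_csr(void)
{
	/* 3x3 identity in CSR form: one non-zero per row. */
	static float nzval[3] = { 1.0f, 1.0f, 1.0f };
	static uint32_t colind[3] = { 0, 1, 2 };
	static uint32_t rowptr[4] = { 0, 1, 2, 3 };

	starpu_data_handle_t handle;
	starpu_csr_data_register(&handle, 0 /* home node: main RAM */,
				 3 /* nnz */, 3 /* nrow */,
				 (uintptr_t) nzval, colind, rowptr,
				 0 /* firstentry */, sizeof(nzval[0]));
	return handle;
}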

+ 179 - 75
src/datawizard/interfaces/data_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009, 2010, 2011-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,18 +21,20 @@
 #include <core/dependencies/data_concurrency.h>
 #include <common/uthash.h>
 #include <common/starpu_spinlock.h>
+#include <core/task.h>
 
 /* Entry in the `registered_handles' hash table.  */
 struct handle_entry
 {
 	UT_hash_handle hh;
 	void *pointer;
-	starpu_data_handle handle;
+	starpu_data_handle_t handle;
 };
 
 /* Hash table mapping host pointers to data handles.  */
 static struct handle_entry *registered_handles;
-static starpu_spinlock_t    registered_handles_lock;
+static struct _starpu_spinlock    registered_handles_lock;
+static int _data_interface_number = STARPU_MAX_INTERFACE_ID;
 
 void _starpu_data_interface_init()
 {
@@ -45,7 +47,8 @@ void _starpu_data_interface_shutdown()
 
 	_starpu_spin_destroy(&registered_handles_lock);
 
-	HASH_ITER(hh, registered_handles, entry, tmp) {
+	HASH_ITER(hh, registered_handles, entry, tmp)
+	{
 		HASH_DEL(registered_handles, entry);
 		free(entry);
 	}
@@ -55,7 +58,7 @@ void _starpu_data_interface_shutdown()
 
 /* Register the mapping from PTR to HANDLE.  If PTR is already mapped to
  * some handle, the new mapping shadows the previous one.   */
-void _starpu_data_register_ram_pointer(starpu_data_handle handle, void *ptr)
+void _starpu_data_register_ram_pointer(starpu_data_handle_t handle, void *ptr)
 {
 	struct handle_entry *entry;
 
@@ -70,9 +73,9 @@ void _starpu_data_register_ram_pointer(starpu_data_handle handle, void *ptr)
 	_starpu_spin_unlock(&registered_handles_lock);
 }
 
-starpu_data_handle starpu_data_lookup(const void *ptr)
+starpu_data_handle_t starpu_data_lookup(const void *ptr)
 {
-	starpu_data_handle result;
+	starpu_data_handle_t result;
 
 	_starpu_spin_lock(&registered_handles_lock);
 	{
@@ -89,11 +92,16 @@ starpu_data_handle starpu_data_lookup(const void *ptr)
 	return result;
 }
 
-/* 
+int
+_starpu_data_is_multiformat_handle(starpu_data_handle_t handle)
+{
+	return handle->ops->is_multiformat;
+}
+/*
  * Start monitoring a piece of data
  */
 
-static void _starpu_register_new_data(starpu_data_handle handle,
+static void _starpu_register_new_data(starpu_data_handle_t handle,
 					uint32_t home_node, uint32_t wt_mask)
 {
 	void *ptr;
@@ -101,8 +109,12 @@ static void _starpu_register_new_data(starpu_data_handle handle,
 	STARPU_ASSERT(handle);
 
 	/* initialize the new lock */
-	handle->req_list = starpu_data_requester_list_new();
+	handle->req_list = _starpu_data_requester_list_new();
 	handle->refcnt = 0;
+	handle->busy_count = 0;
+	handle->busy_waiting = 0;
+	_STARPU_PTHREAD_MUTEX_INIT(&handle->busy_mutex, NULL);
+	_STARPU_PTHREAD_COND_INIT(&handle->busy_cond, NULL);
 	_starpu_spin_init(&handle->header_lock);
 
 	/* first take care to properly lock the data */
@@ -122,7 +134,7 @@ static void _starpu_register_new_data(starpu_data_handle handle,
 	handle->sequential_consistency =
 		starpu_data_get_default_sequential_consistency_flag();
 
-	PTHREAD_MUTEX_INIT(&handle->sequential_consistency_mutex, NULL);
+	_STARPU_PTHREAD_MUTEX_INIT(&handle->sequential_consistency_mutex, NULL);
 	handle->last_submitted_mode = STARPU_R;
 	handle->last_submitted_writer = NULL;
 	handle->last_submitted_readers = NULL;
@@ -134,7 +146,7 @@ static void _starpu_register_new_data(starpu_data_handle handle,
 	handle->init_cl = NULL;
 
 	handle->reduction_refcnt = 0;
-	handle->reduction_req_list = starpu_data_requester_list_new();
+	handle->reduction_req_list = _starpu_data_requester_list_new();
 
 #ifdef STARPU_USE_FXT
 	handle->last_submitted_ghost_writer_id_is_valid = 0;
@@ -146,6 +158,7 @@ static void _starpu_register_new_data(starpu_data_handle handle,
 
 	/* Store some values directly in the handle not to recompute them all
 	 * the time. */
+	STARPU_ASSERT(handle->ops->get_size);
 	handle->data_size = handle->ops->get_size(handle);
 	handle->footprint = _starpu_compute_data_footprint(handle);
 
@@ -156,20 +169,22 @@ static void _starpu_register_new_data(starpu_data_handle handle,
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		struct starpu_data_replicate_s *replicate;
+		struct _starpu_data_replicate *replicate;
 		replicate = &handle->per_node[node];
-		
+
 		replicate->memory_node = node;
 		replicate->relaxed_coherency = 0;
 		replicate->refcnt = 0;
 
-		if (node == home_node) {
+		if (node == home_node)
+		{
 			/* this is the home node with the only valid copy */
 			replicate->state = STARPU_OWNER;
 			replicate->allocated = 1;
 			replicate->automatically_allocated = 0;
 		}
-		else {
+		else
+		{
 			/* the value is not available here yet */
 			replicate->state = STARPU_INVALID;
 			replicate->allocated = 0;
@@ -180,7 +195,7 @@ static void _starpu_register_new_data(starpu_data_handle handle,
 	unsigned nworkers = starpu_worker_get_count();
 	for (worker = 0; worker < nworkers; worker++)
 	{
-		struct starpu_data_replicate_s *replicate;
+		struct _starpu_data_replicate *replicate;
 		replicate = &handle->per_worker[worker];
 		replicate->allocated = 0;
 		replicate->automatically_allocated = 0;
@@ -194,6 +209,7 @@ static void _starpu_register_new_data(starpu_data_handle handle,
 			replicate->request[node] = NULL;
 		}
 
+		/* Assuming being used for SCRATCH for now, patched when entering REDUX mode */
 		replicate->relaxed_coherency = 1;
 		replicate->initialized = 0;
 		replicate->memory_node = starpu_worker_get_memory_node(worker);
@@ -212,10 +228,9 @@ static void _starpu_register_new_data(starpu_data_handle handle,
 	}
 }
 
-static starpu_data_handle _starpu_data_handle_allocate(struct starpu_data_interface_ops_t *interface_ops)
+static starpu_data_handle_t _starpu_data_handle_allocate(struct starpu_data_interface_ops *interface_ops)
 {
-	starpu_data_handle handle = (starpu_data_handle)
-		calloc(1, sizeof(struct starpu_data_state_t));
+	starpu_data_handle_t handle = (starpu_data_handle_t) calloc(1, sizeof(struct _starpu_data_state));
 
 	STARPU_ASSERT(handle);
 
@@ -226,7 +241,16 @@ static starpu_data_handle _starpu_data_handle_allocate(struct starpu_data_interf
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		struct starpu_data_replicate_s *replicate;
+#ifdef STARPU_MEMORY_STATUS
+		/* Stats initilization */
+		handle->stats_direct_access[node]=0;
+		handle->stats_loaded_shared[node]=0;
+		handle->stats_shared_to_owner[node]=0;
+		handle->stats_loaded_owner[node]=0;
+		handle->stats_invalidated[node]=0;
+#endif
+
+		struct _starpu_data_replicate *replicate;
 		replicate = &handle->per_node[node];
 		/* relaxed_coherency = 0 */
 
@@ -240,7 +264,7 @@ static starpu_data_handle _starpu_data_handle_allocate(struct starpu_data_interf
 	unsigned nworkers = starpu_worker_get_count();
 	for (worker = 0; worker < nworkers; worker++)
 	{
-		struct starpu_data_replicate_s *replicate;
+		struct _starpu_data_replicate *replicate;
 		replicate = &handle->per_worker[worker];
 
 		replicate->handle = handle;
@@ -253,24 +277,31 @@ static starpu_data_handle _starpu_data_handle_allocate(struct starpu_data_interf
 	return handle;
 }
 
-void starpu_data_register(starpu_data_handle *handleptr, uint32_t home_node,
+void starpu_data_register(starpu_data_handle_t *handleptr, uint32_t home_node,
 				void *data_interface,
-				struct starpu_data_interface_ops_t *ops)
+				struct starpu_data_interface_ops *ops)
 {
-	starpu_data_handle handle =
+	starpu_data_handle_t handle =
 		_starpu_data_handle_allocate(ops);
 
 	STARPU_ASSERT(handleptr);
 	*handleptr = handle;
-
+	handle->mf_node = home_node;
 
 	/* fill the interface fields with the appropriate method */
+	STARPU_ASSERT(ops->register_data_handle);
 	ops->register_data_handle(handle, home_node, data_interface);
 
 	_starpu_register_new_data(handle, home_node, 0);
 }
 
-void *starpu_handle_to_pointer(starpu_data_handle handle, uint32_t node)
+void starpu_data_register_same(starpu_data_handle_t *handledst, starpu_data_handle_t handlesrc)
+{
+	void *local_interface = starpu_data_get_interface_on_node(handlesrc, 0);
+	starpu_data_register(handledst, -1, local_interface, handlesrc->ops);
+}
+
+void *starpu_handle_to_pointer(starpu_data_handle_t handle, uint32_t node)
 {
 	/* Check whether the operation is supported and the node has actually
 	 * been allocated.  */
@@ -283,39 +314,39 @@ void *starpu_handle_to_pointer(starpu_data_handle handle, uint32_t node)
 	return NULL;
 }
 
-void *starpu_handle_get_local_ptr(starpu_data_handle handle)
+void *starpu_handle_get_local_ptr(starpu_data_handle_t handle)
 {
 	return starpu_handle_to_pointer(handle,
 					_starpu_get_local_memory_node());
 }
 
-int starpu_data_get_rank(starpu_data_handle handle)
+int starpu_data_get_rank(starpu_data_handle_t handle)
 {
 	return handle->rank;
 }
 
-int starpu_data_set_rank(starpu_data_handle handle, int rank)
+int starpu_data_set_rank(starpu_data_handle_t handle, int rank)
 {
         handle->rank = rank;
         return 0;
 }
 
-int starpu_data_get_tag(starpu_data_handle handle)
+int starpu_data_get_tag(starpu_data_handle_t handle)
 {
 	return handle->tag;
 }
 
-int starpu_data_set_tag(starpu_data_handle handle, int tag)
+int starpu_data_set_tag(starpu_data_handle_t handle, int tag)
 {
         handle->tag = tag;
         return 0;
 }
 
-/* 
+/*
  * Stop monitoring a piece of data
  */
 
-void _starpu_data_free_interfaces(starpu_data_handle handle)
+void _starpu_data_free_interfaces(starpu_data_handle_t handle)
 {
 	const void *ram_ptr;
 	unsigned node;
@@ -348,40 +379,55 @@ void _starpu_data_free_interfaces(starpu_data_handle handle)
 	}
 }
 
-struct unregister_callback_arg {
+struct _starpu_unregister_callback_arg
+{
 	unsigned memory_node;
-	starpu_data_handle handle;
+	starpu_data_handle_t handle;
 	unsigned terminated;
 	pthread_mutex_t mutex;
 	pthread_cond_t cond;
-}; 
+};
+
+/* Check whether we should tell starpu_data_unregister that the data handle is
+ * not busy any more.
+ * The header is supposed to be locked */
+void _starpu_data_check_not_busy(starpu_data_handle_t handle)
+{
+	if (!handle->busy_count && handle->busy_waiting)
+	{
+		_STARPU_PTHREAD_MUTEX_LOCK(&handle->busy_mutex);
+		_STARPU_PTHREAD_COND_BROADCAST(&handle->busy_cond);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->busy_mutex);
+	}
+}
 
 static void _starpu_data_unregister_fetch_data_callback(void *_arg)
 {
 	int ret;
-	struct unregister_callback_arg *arg = (struct unregister_callback_arg *) _arg;
+	struct _starpu_unregister_callback_arg *arg = (struct _starpu_unregister_callback_arg *) _arg;
 
-	starpu_data_handle handle = arg->handle;
+	starpu_data_handle_t handle = arg->handle;
 
 	STARPU_ASSERT(handle);
 
-	struct starpu_data_replicate_s *replicate = &handle->per_node[arg->memory_node];
+	struct _starpu_data_replicate *replicate = &handle->per_node[arg->memory_node];
 
-	ret = _starpu_fetch_data_on_node(handle, replicate, STARPU_R, 0, NULL, NULL);
+	ret = _starpu_fetch_data_on_node(handle, replicate, STARPU_R, 0, 0, NULL, NULL);
 	STARPU_ASSERT(!ret);
-	
+
 	/* unlock the caller */
-	PTHREAD_MUTEX_LOCK(&arg->mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&arg->mutex);
 	arg->terminated = 1;
-	PTHREAD_COND_SIGNAL(&arg->cond);
-	PTHREAD_MUTEX_UNLOCK(&arg->mutex);
+	_STARPU_PTHREAD_COND_SIGNAL(&arg->cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&arg->mutex);
 }
 
 /* Unregister the data handle, perhaps we don't need to update the home_node
  * (in that case coherent is set to 0) */
-static void _starpu_data_unregister(starpu_data_handle handle, unsigned coherent)
+static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned coherent)
 {
 	STARPU_ASSERT(handle);
+	STARPU_ASSERT_MSG(handle->nchildren == 0, "data needs to be unpartitioned before unregistration");
 
 	if (coherent)
 	{
@@ -390,69 +436,120 @@ static void _starpu_data_unregister(starpu_data_handle handle, unsigned coherent
 
 		/* Fetch data in the home of the data to ensure we have a valid copy
 		 * where we registered it */
-		int home_node = handle->home_node; 
+		int home_node = handle->home_node;
 		if (home_node >= 0)
 		{
-			struct unregister_callback_arg arg;
+			struct _starpu_unregister_callback_arg arg;
 			arg.handle = handle;
 			arg.memory_node = (unsigned)home_node;
 			arg.terminated = 0;
-			PTHREAD_MUTEX_INIT(&arg.mutex, NULL);
-			PTHREAD_COND_INIT(&arg.cond, NULL);
-	
+			_STARPU_PTHREAD_MUTEX_INIT(&arg.mutex, NULL);
+			_STARPU_PTHREAD_COND_INIT(&arg.cond, NULL);
+
 			if (!_starpu_attempt_to_submit_data_request_from_apps(handle, STARPU_R,
 					_starpu_data_unregister_fetch_data_callback, &arg))
 			{
 				/* no one has locked this data yet, so we proceed immediately */
-				struct starpu_data_replicate_s *home_replicate = &handle->per_node[home_node];
-				int ret = _starpu_fetch_data_on_node(handle, home_replicate, STARPU_R, 0, NULL, NULL);
+				struct _starpu_data_replicate *home_replicate = &handle->per_node[home_node];
+				int ret = _starpu_fetch_data_on_node(handle, home_replicate, STARPU_R, 0, 0, NULL, NULL);
 				STARPU_ASSERT(!ret);
 			}
-			else {
-				PTHREAD_MUTEX_LOCK(&arg.mutex);
+			else
+			{
+				_STARPU_PTHREAD_MUTEX_LOCK(&arg.mutex);
 				while (!arg.terminated)
-					PTHREAD_COND_WAIT(&arg.cond, &arg.mutex);
-				PTHREAD_MUTEX_UNLOCK(&arg.mutex);
+					_STARPU_PTHREAD_COND_WAIT(&arg.cond, &arg.mutex);
+				_STARPU_PTHREAD_MUTEX_UNLOCK(&arg.mutex);
 			}
+			_starpu_release_data_on_node(handle, 0, &handle->per_node[home_node]);
+		}
+
+		/* If this handle uses a multiformat interface, we may have to convert
+		 * this piece of data back into the CPU format.
+		 * XXX : This is quite hacky, could we submit a task instead ?
+		 */
+		if (_starpu_data_is_multiformat_handle(handle) &&
+			starpu_node_get_kind(handle->mf_node) != STARPU_CPU_RAM)
+		{
+			_STARPU_DEBUG("Conversion needed\n");
+			void *buffers[1];
+			struct starpu_multiformat_interface *format_interface;
+			format_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, 0);
+			struct starpu_codelet *cl;
+			enum starpu_node_kind node_kind = starpu_node_get_kind(handle->mf_node);
+
+			struct starpu_multiformat_data_interface_ops *mf_ops;
+			mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
+			switch (node_kind)
+			{
+#ifdef STARPU_USE_CUDA
+				case STARPU_CUDA_RAM:
+					cl = mf_ops->cuda_to_cpu_cl;
+					break;
+#endif
+#ifdef STARPU_USE_OPENCL
+				case STARPU_OPENCL_RAM:
+					cl = mf_ops->opencl_to_cpu_cl;
+					break;
+#endif
+				case STARPU_CPU_RAM:      /* Impossible ! */
+				case STARPU_SPU_LS:       /* Not supported */
+				default:
+					STARPU_ASSERT(0);
+			}
+			buffers[0] = format_interface;
+
+			_starpu_cl_func_t func = _starpu_task_get_cpu_nth_implementation(cl, 0);
+			STARPU_ASSERT(func);
+			func(buffers, NULL);
 		}
 	}
-	else {
+	else
+	{
 		/* Should we postpone the unregister operation ? */
 		if ((handle->refcnt > 0) && handle->lazy_unregister)
 			return;
 	}
 
+	_starpu_spin_lock(&handle->header_lock);
+	/* Tell holders of references that we're starting waiting */
+	handle->busy_waiting = 1;
+	_starpu_spin_unlock(&handle->header_lock);
+
+	/* Wait for all requests to finish (notably WT requests) */
+	_STARPU_PTHREAD_MUTEX_LOCK(&handle->busy_mutex);
+	while (handle->busy_count)
+		_STARPU_PTHREAD_COND_WAIT(&handle->busy_cond, &handle->busy_mutex);
+
+	/* Wait for finished requests to release the handle */
+	_starpu_spin_lock(&handle->header_lock);
 	_starpu_data_free_interfaces(handle);
 
 	/* Destroy the data now */
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		struct starpu_data_replicate_s *local = &handle->per_node[node];
-
-		if (local->allocated && local->automatically_allocated){
-			/* free the data copy in a lazy fashion */
-			_starpu_request_mem_chunk_removal(handle, node);
-		}
+		/* free the data copy in a lazy fashion */
+		_starpu_request_mem_chunk_removal(handle, node);
 	}
 
-	starpu_data_requester_list_delete(handle->req_list);
-	starpu_data_requester_list_delete(handle->reduction_req_list);
+	_starpu_data_requester_list_delete(handle->req_list);
+	_starpu_data_requester_list_delete(handle->reduction_req_list);
 
 	free(handle);
 }
 
-void starpu_data_unregister(starpu_data_handle handle)
+void starpu_data_unregister(starpu_data_handle_t handle)
 {
 	_starpu_data_unregister(handle, 1);
 }
 
-void starpu_data_unregister_no_coherency(starpu_data_handle handle)
+void starpu_data_unregister_no_coherency(starpu_data_handle_t handle)
 {
 	_starpu_data_unregister(handle, 0);
 }
 
-void starpu_data_invalidate(starpu_data_handle handle)
+void starpu_data_invalidate(starpu_data_handle_t handle)
 {
 	STARPU_ASSERT(handle);
 
@@ -463,14 +560,15 @@ void starpu_data_invalidate(starpu_data_handle handle)
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		struct starpu_data_replicate_s *local = &handle->per_node[node];
+		struct _starpu_data_replicate *local = &handle->per_node[node];
 
-		if (local->allocated && local->automatically_allocated){
+		if (local->allocated && local->automatically_allocated)
+		{
 			/* free the data copy in a lazy fashion */
 			_starpu_request_mem_chunk_removal(handle, node);
 		}
 
-		local->state = STARPU_INVALID; 
+		local->state = STARPU_INVALID;
 	}
 
 	_starpu_spin_unlock(&handle->header_lock);
@@ -478,12 +576,18 @@ void starpu_data_invalidate(starpu_data_handle handle)
 	starpu_data_release(handle);
 }
 
-unsigned starpu_get_handle_interface_id(starpu_data_handle handle)
+enum starpu_data_interface_id starpu_handle_get_interface_id(starpu_data_handle_t handle)
 {
 	return handle->ops->interfaceid;
 }
 
-void *starpu_data_get_interface_on_node(starpu_data_handle handle, unsigned memory_node)
+void *starpu_data_get_interface_on_node(starpu_data_handle_t handle, unsigned memory_node)
 {
 	return handle->per_node[memory_node].data_interface;
 }
+
+int starpu_data_interface_get_next_id()
+{
+	_data_interface_number += 1;
+	return _data_interface_number-1;
+}
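
A short sketch of the public entry points touched here, including the new starpu_data_register_same() (the starpu_init()/starpu_matrix_data_register() calls are the standard public API, assumed unchanged by this commit):

#include <starpu.h>
#include <stdlib.h>

#define NX 64
#define NY 32

int main(void)
{
	float *A = malloc(NX*NY*sizeof(*A));
	starpu_data_handle_t a_handle, tmp_handle;

	starpu_init(NULL);
	starpu_matrix_data_register(&a_handle, 0 /* home node */,
				    (uintptr_t) A, NX /* ld */, NX, NY, sizeof(*A));

	/* The handle can be looked up from the registered host pointer. */
	STARPU_ASSERT(starpu_data_lookup(A) == a_handle);

	/* New in this commit: a handle with the same layout as an existing
	 * one, but no home node (-1), e.g. for temporary buffers. */
	starpu_data_register_same(&tmp_handle, a_handle);

	/* ... submit tasks ... */

	starpu_data_unregister(tmp_handle);
	starpu_data_unregister(a_handle); /* now waits for pending requests (busy_count) */
	starpu_shutdown();
	free(A);
	return 0;
}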

+ 6 - 3
src/datawizard/interfaces/data_interface.h

@@ -22,15 +22,18 @@
 #include <common/config.h>
 
 /* Some data interfaces or filters use this interface internally */
-extern struct starpu_data_interface_ops_t _starpu_interface_matrix_ops;
-void _starpu_data_free_interfaces(starpu_data_handle handle)
+extern struct starpu_data_interface_ops _starpu_interface_matrix_ops;
+void _starpu_data_free_interfaces(starpu_data_handle_t handle)
 	STARPU_ATTRIBUTE_INTERNAL;
 
 extern void _starpu_data_interface_init(void) STARPU_ATTRIBUTE_INTERNAL;
+extern void _starpu_data_check_not_busy(starpu_data_handle_t handle) STARPU_ATTRIBUTE_INTERNAL;
 extern void _starpu_data_interface_shutdown(void) STARPU_ATTRIBUTE_INTERNAL;
 
-extern void _starpu_data_register_ram_pointer(starpu_data_handle handle,
+extern void _starpu_data_register_ram_pointer(starpu_data_handle_t handle,
 						void *ptr)
 	STARPU_ATTRIBUTE_INTERNAL;
 
+extern int _starpu_data_is_multiformat_handle(starpu_data_handle_t handle);
+
 #endif // __DATA_INTERFACE_H__

+ 26 - 23
src/datawizard/interfaces/matrix_filters.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
@@ -25,9 +25,9 @@
  */
 void starpu_block_filter_func(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, unsigned nchunks)
 {
-       starpu_matrix_interface_t *matrix_father = (starpu_matrix_interface_t *) father_interface;
-       starpu_matrix_interface_t *matrix_child = (starpu_matrix_interface_t *) child_interface;
-  
+	struct starpu_matrix_interface *matrix_father = (struct starpu_matrix_interface *) father_interface;
+	struct starpu_matrix_interface *matrix_child = (struct starpu_matrix_interface *) child_interface;
+
 	uint32_t nx = matrix_father->nx;
 	uint32_t ny = matrix_father->ny;
 	size_t elemsize = matrix_father->elemsize;
@@ -36,28 +36,30 @@ void starpu_block_filter_func(void *father_interface, void *child_interface, STA
 
 	size_t chunk_size = ((size_t)nx + nchunks - 1)/nchunks;
 	size_t offset = (size_t)id*chunk_size*elemsize;
-	
-	uint32_t child_nx = 
+
+	uint32_t child_nx =
 	  STARPU_MIN(chunk_size, (size_t)nx - (size_t)id*chunk_size);
-	
+
 	/* update the child's interface */
 	matrix_child->nx = child_nx;
 	matrix_child->ny = ny;
 	matrix_child->elemsize = elemsize;
-	
+
 	/* is the information on this node valid ? */
-	if (matrix_father->ptr) {
-	  matrix_child->ptr = matrix_father->ptr + offset;
-	  matrix_child->ld = matrix_father->ld;
-	  matrix_child->dev_handle = matrix_father->dev_handle;
-	  matrix_child->offset = matrix_father->offset + offset;
+	if (matrix_father->dev_handle)
+	{
+		if (matrix_father->ptr)
+			matrix_child->ptr = matrix_father->ptr + offset;
+		matrix_child->ld = matrix_father->ld;
+		matrix_child->dev_handle = matrix_father->dev_handle;
+		matrix_child->offset = matrix_father->offset + offset;
 	}
 }
 
 void starpu_vertical_block_filter_func(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, unsigned nchunks)
 {
-        starpu_matrix_interface_t *matrix_father = (starpu_matrix_interface_t *) father_interface;
-        starpu_matrix_interface_t *matrix_child = (starpu_matrix_interface_t *) child_interface;
+        struct starpu_matrix_interface *matrix_father = (struct starpu_matrix_interface *) father_interface;
+        struct starpu_matrix_interface *matrix_child = (struct starpu_matrix_interface *) child_interface;
 
 	uint32_t nx = matrix_father->nx;
 	uint32_t ny = matrix_father->ny;
@@ -66,7 +68,7 @@ void starpu_vertical_block_filter_func(void *father_interface, void *child_inter
 	STARPU_ASSERT(nchunks <= ny);
 
 	size_t chunk_size = ((size_t)ny + nchunks - 1)/nchunks;
-	size_t child_ny = 
+	size_t child_ny =
 	  STARPU_MIN(chunk_size, (size_t)ny - (size_t)id*chunk_size);
 
 	matrix_child->nx = nx;
@@ -74,12 +76,13 @@ void starpu_vertical_block_filter_func(void *father_interface, void *child_inter
 	matrix_child->elemsize = elemsize;
 
 	/* is the information on this node valid ? */
-	if (matrix_father->ptr) {
-	  size_t offset = 
-	    (size_t)id*chunk_size*matrix_father->ld*elemsize;
-	  matrix_child->ptr = matrix_father->ptr + offset;
-	  matrix_child->ld = matrix_father->ld;
-	  matrix_child->dev_handle = matrix_father->dev_handle;
-	  matrix_child->offset = matrix_father->offset + offset;
+	if (matrix_father->dev_handle)
+	{
+		size_t offset = (size_t)id*chunk_size*matrix_father->ld*elemsize;
+		if (matrix_father->ptr)
+			matrix_child->ptr = matrix_father->ptr + offset;
+		matrix_child->ld = matrix_father->ld;
+		matrix_child->dev_handle = matrix_father->dev_handle;
+		matrix_child->offset = matrix_father->offset + offset;
 	}
 }
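
Partitioning sketch for the two matrix filters above (assumes the standard starpu_data_partition()/starpu_data_get_sub_data() API; per the code above, starpu_block_filter_func cuts along nx and starpu_vertical_block_filter_func cuts along ny):

#include <starpu.h>
#include <stdlib.h>

#define NX 1024
#define NY 1024
#define PARTS 4

int main(void)
{
	float *A = malloc((size_t)NX*NY*sizeof(*A));
	starpu_data_handle_t handle;
	unsigned i;

	starpu_init(NULL);
	starpu_matrix_data_register(&handle, 0, (uintptr_t) A,
				    NX /* ld */, NX, NY, sizeof(*A));

	struct starpu_data_filter f =
	{
		.filter_func = starpu_block_filter_func,	/* split along nx */
		.nchildren = PARTS,
	};
	starpu_data_partition(handle, &f);

	for (i = 0; i < PARTS; i++)
	{
		starpu_data_handle_t sub = starpu_data_get_sub_data(handle, 1, i);
		/* ... submit a task on `sub' ... */
		(void) sub;
	}

	starpu_data_unpartition(handle, 0 /* gather on node 0 */);
	starpu_data_unregister(handle);
	starpu_shutdown();
	free(A);
	return 0;
}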

+ 166 - 105
src/datawizard/interfaces/matrix_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,11 +20,14 @@
 #include <datawizard/coherency.h>
 #include <datawizard/copy_driver.h>
 #include <datawizard/filters.h>
-#include <common/hash.h>
+#include <starpu_hash.h>
 #include <starpu_cuda.h>
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 
+/* If you can promise that there is no stride in your matrices, you can define this */
+// #define NO_STRIDE
+
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 #ifdef STARPU_USE_CUDA
 static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
@@ -32,7 +35,9 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIB
 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
-//static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
+#ifdef NO_STRIDE
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
+#endif
 #endif
 #ifdef STARPU_USE_OPENCL
 static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
@@ -41,7 +46,8 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event);
 #endif
 
-static const struct starpu_data_copy_methods matrix_copy_data_methods_s = {
+static const struct starpu_data_copy_methods matrix_copy_data_methods_s =
+{
 	.ram_to_ram = copy_ram_to_ram,
 	.ram_to_spu = NULL,
 #ifdef STARPU_USE_CUDA
@@ -50,7 +56,9 @@ static const struct starpu_data_copy_methods matrix_copy_data_methods_s = {
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
 	.cuda_to_cuda = copy_cuda_to_cuda,
-//	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
+#ifdef NO_STRIDE
+	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
+#endif
 #endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = copy_ram_to_opencl,
@@ -64,19 +72,20 @@ static const struct starpu_data_copy_methods matrix_copy_data_methods_s = {
 	.spu_to_spu = NULL
 };
 
-static void register_matrix_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
-static void *matrix_handle_to_pointer(starpu_data_handle data_handle, uint32_t node);
+static void register_matrix_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface);
+static void *matrix_handle_to_pointer(starpu_data_handle_t data_handle, uint32_t node);
 static ssize_t allocate_matrix_buffer_on_node(void *data_interface_, uint32_t dst_node);
 static void free_matrix_buffer_on_node(void *data_interface, uint32_t node);
-static size_t matrix_interface_get_size(starpu_data_handle handle);
-static uint32_t footprint_matrix_interface_crc32(starpu_data_handle handle);
+static size_t matrix_interface_get_size(starpu_data_handle_t handle);
+static uint32_t footprint_matrix_interface_crc32(starpu_data_handle_t handle);
 static int matrix_compare(void *data_interface_a, void *data_interface_b);
-static void display_matrix_interface(starpu_data_handle handle, FILE *f);
+static void display_matrix_interface(starpu_data_handle_t handle, FILE *f);
 #ifdef STARPU_USE_GORDON
-static int convert_matrix_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss); 
+static int convert_matrix_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss);
 #endif
 
-struct starpu_data_interface_ops_t _starpu_interface_matrix_ops = {
+struct starpu_data_interface_ops _starpu_interface_matrix_ops =
+{
 	.register_data_handle = register_matrix_handle,
 	.allocate_data_on_node = allocate_matrix_buffer_on_node,
 	.handle_to_pointer = matrix_handle_to_pointer,
@@ -88,13 +97,13 @@ struct starpu_data_interface_ops_t _starpu_interface_matrix_ops = {
 #ifdef STARPU_USE_GORDON
 	.convert_to_gordon = convert_matrix_to_gordon,
 #endif
-	.interfaceid = STARPU_MATRIX_INTERFACE_ID, 
-	.interface_size = sizeof(starpu_matrix_interface_t),
-	.display = display_matrix_interface
+	.interfaceid = STARPU_MATRIX_INTERFACE_ID,
+	.interface_size = sizeof(struct starpu_matrix_interface),
+	.display = display_matrix_interface,
 };
 
 #ifdef STARPU_USE_GORDON
-static int convert_matrix_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss) 
+static int convert_matrix_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss)
 {
 	size_t elemsize = GET_MATRIX_ELEMSIZE(interface);
 	uint32_t nx = STARPU_MATRIX_GET_NX(interface);
@@ -111,23 +120,25 @@ static int convert_matrix_to_gordon(void *data_interface, uint64_t *ptr, gordon_
 }
 #endif
 
-static void register_matrix_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface)
+static void register_matrix_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface)
 {
-	starpu_matrix_interface_t *matrix_interface = (starpu_matrix_interface_t *) data_interface;
+	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *) data_interface;
 
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		starpu_matrix_interface_t *local_interface = (starpu_matrix_interface_t *)
+		struct starpu_matrix_interface *local_interface = (struct starpu_matrix_interface *)
 			starpu_data_get_interface_on_node(handle, node);
 
-		if (node == home_node) {
+		if (node == home_node)
+		{
 			local_interface->ptr = matrix_interface->ptr;
                         local_interface->dev_handle = matrix_interface->dev_handle;
                         local_interface->offset = matrix_interface->offset;
 			local_interface->ld  = matrix_interface->ld;
 		}
-		else {
+		else
+		{
 			local_interface->ptr = 0;
 			local_interface->dev_handle = 0;
 			local_interface->offset = 0;
@@ -140,11 +151,11 @@ static void register_matrix_handle(starpu_data_handle handle, uint32_t home_node
 	}
 }
 
-static void *matrix_handle_to_pointer(starpu_data_handle handle, uint32_t node)
+static void *matrix_handle_to_pointer(starpu_data_handle_t handle, uint32_t node)
 {
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_matrix_interface_t *matrix_interface = (starpu_matrix_interface_t *)
+	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
 		starpu_data_get_interface_on_node(handle, node);
 
 	return (void*) matrix_interface->ptr;
@@ -152,11 +163,12 @@ static void *matrix_handle_to_pointer(starpu_data_handle handle, uint32_t node)
 
 
 /* declare a new data with the matrix interface */
-void starpu_matrix_data_register(starpu_data_handle *handleptr, uint32_t home_node,
+void starpu_matrix_data_register(starpu_data_handle_t *handleptr, uint32_t home_node,
 			uintptr_t ptr, uint32_t ld, uint32_t nx,
 			uint32_t ny, size_t elemsize)
 {
-	starpu_matrix_interface_t matrix_interface = {
+	struct starpu_matrix_interface matrix_interface =
+	{
 		.ptr = ptr,
 		.ld = ld,
 		.nx = nx,
@@ -169,15 +181,15 @@ void starpu_matrix_data_register(starpu_data_handle *handleptr, uint32_t home_no
 	starpu_data_register(handleptr, home_node, &matrix_interface, &_starpu_interface_matrix_ops);
 }
 
-static uint32_t footprint_matrix_interface_crc32(starpu_data_handle handle)
+static uint32_t footprint_matrix_interface_crc32(starpu_data_handle_t handle)
 {
-	return _starpu_crc32_be(starpu_matrix_get_nx(handle), starpu_matrix_get_ny(handle));
+	return starpu_crc32_be(starpu_matrix_get_nx(handle), starpu_matrix_get_ny(handle));
 }
 
 static int matrix_compare(void *data_interface_a, void *data_interface_b)
 {
-	starpu_matrix_interface_t *matrix_a = (starpu_matrix_interface_t *) data_interface_a;
-	starpu_matrix_interface_t *matrix_b = (starpu_matrix_interface_t *) data_interface_b;
+	struct starpu_matrix_interface *matrix_a = (struct starpu_matrix_interface *) data_interface_a;
+	struct starpu_matrix_interface *matrix_b = (struct starpu_matrix_interface *) data_interface_b;
 
 	/* Two matricess are considered compatible if they have the same size */
 	return ((matrix_a->nx == matrix_b->nx)
@@ -185,71 +197,71 @@ static int matrix_compare(void *data_interface_a, void *data_interface_b)
 			&& (matrix_a->elemsize == matrix_b->elemsize));
 }
 
-static void display_matrix_interface(starpu_data_handle handle, FILE *f)
+static void display_matrix_interface(starpu_data_handle_t handle, FILE *f)
 {
-	starpu_matrix_interface_t *matrix_interface = (starpu_matrix_interface_t *)
+	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	fprintf(f, "%u\t%u\t", matrix_interface->nx, matrix_interface->ny);
 }
 
-static size_t matrix_interface_get_size(starpu_data_handle handle)
+static size_t matrix_interface_get_size(starpu_data_handle_t handle)
 {
-	starpu_matrix_interface_t *matrix_interface = (starpu_matrix_interface_t *)
+	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	size_t size;
-	size = (size_t)matrix_interface->nx*matrix_interface->ny*matrix_interface->elemsize; 
+	size = (size_t)matrix_interface->nx*matrix_interface->ny*matrix_interface->elemsize;
 
 	return size;
 }
 
 /* offer an access to the data parameters */
-uint32_t starpu_matrix_get_nx(starpu_data_handle handle)
+uint32_t starpu_matrix_get_nx(starpu_data_handle_t handle)
 {
-	starpu_matrix_interface_t *matrix_interface = (starpu_matrix_interface_t *)
+	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return matrix_interface->nx;
 }
 
-uint32_t starpu_matrix_get_ny(starpu_data_handle handle)
+uint32_t starpu_matrix_get_ny(starpu_data_handle_t handle)
 {
-	starpu_matrix_interface_t *matrix_interface = (starpu_matrix_interface_t *)
+	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return matrix_interface->ny;
 }
 
-uint32_t starpu_matrix_get_local_ld(starpu_data_handle handle)
+uint32_t starpu_matrix_get_local_ld(starpu_data_handle_t handle)
 {
 	unsigned node;
 	node = _starpu_get_local_memory_node();
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_matrix_interface_t *matrix_interface = (starpu_matrix_interface_t *)
+	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
 		starpu_data_get_interface_on_node(handle, node);
 
 	return matrix_interface->ld;
 }
 
-uintptr_t starpu_matrix_get_local_ptr(starpu_data_handle handle)
+uintptr_t starpu_matrix_get_local_ptr(starpu_data_handle_t handle)
 {
 	unsigned node;
 	node = _starpu_get_local_memory_node();
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_matrix_interface_t *matrix_interface = (starpu_matrix_interface_t *)
+	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
 		starpu_data_get_interface_on_node(handle, node);
 
 	return matrix_interface->ptr;
 }
 
-size_t starpu_matrix_get_elemsize(starpu_data_handle handle)
+size_t starpu_matrix_get_elemsize(starpu_data_handle_t handle)
 {
-	starpu_matrix_interface_t *matrix_interface = (starpu_matrix_interface_t *)
+	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return matrix_interface->elemsize;
@@ -260,7 +272,7 @@ size_t starpu_matrix_get_elemsize(starpu_data_handle handle)
 /* returns the size of the allocated area */
 static ssize_t allocate_matrix_buffer_on_node(void *data_interface_, uint32_t dst_node)
 {
-	uintptr_t addr = 0;
+	uintptr_t addr = 0, handle = 0;
 	unsigned fail = 0;
 	ssize_t allocated_memory;
 
@@ -268,19 +280,20 @@ static ssize_t allocate_matrix_buffer_on_node(void *data_interface_, uint32_t ds
 	cudaError_t status;
 #endif
 
-	starpu_matrix_interface_t *matrix_interface = (starpu_matrix_interface_t *) data_interface_;
+	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *) data_interface_;
 
 	uint32_t nx = matrix_interface->nx;
 	uint32_t ny = matrix_interface->ny;
 	uint32_t ld = nx; // by default
 	size_t elemsize = matrix_interface->elemsize;
 
-	starpu_node_kind kind = _starpu_get_node_kind(dst_node);
+	enum starpu_node_kind kind = starpu_node_get_kind(dst_node);
 
-	switch(kind) {
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
-			addr = (uintptr_t)malloc((size_t)nx*ny*elemsize);
-			if (!addr) 
+			handle = addr = (uintptr_t)malloc((size_t)nx*ny*elemsize);
+			if (!addr)
 				fail = 1;
 
 			break;
@@ -291,9 +304,10 @@ static ssize_t allocate_matrix_buffer_on_node(void *data_interface_, uint32_t ds
 			{
 				if (STARPU_UNLIKELY(status != cudaErrorMemoryAllocation))
 					 STARPU_CUDA_REPORT_ERROR(status);
-					
+
 				fail = 1;
 			}
+			handle = addr;
 
 			ld = nx;
 
@@ -303,52 +317,57 @@ static ssize_t allocate_matrix_buffer_on_node(void *data_interface_, uint32_t ds
 	        case STARPU_OPENCL_RAM:
 			{
                                 int ret;
-                                void *ptr;
-                                ret = _starpu_opencl_allocate_memory(&ptr, nx*ny*elemsize, CL_MEM_READ_WRITE);
-                                addr = (uintptr_t)ptr;
-				if (ret) {
+				cl_mem mem;
+                                ret = starpu_opencl_allocate_memory(&mem, nx*ny*elemsize, CL_MEM_READ_WRITE);
+				handle = (uintptr_t)mem;
+				if (ret)
+				{
 					fail = 1;
 				}
 				break;
 			}
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 
-	if (!fail) {
+	if (!fail)
+	{
 		/* allocation succeeded */
 		allocated_memory = (size_t)nx*ny*elemsize;
 
 		/* update the data properly in consequence */
 		matrix_interface->ptr = addr;
-                matrix_interface->dev_handle = addr;
+		matrix_interface->dev_handle = handle;
                 matrix_interface->offset = 0;
 		matrix_interface->ld = ld;
-	} else {
+	}
+	else
+	{
 		/* allocation failed */
 		allocated_memory = -ENOMEM;
 	}
-	
+
 	return allocated_memory;
 }
 
 static void free_matrix_buffer_on_node(void *data_interface, uint32_t node)
 {
-	starpu_matrix_interface_t *matrix_interface = (starpu_matrix_interface_t *) data_interface;
+	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *) data_interface;
 
 #ifdef STARPU_USE_CUDA
 	cudaError_t status;
 #endif
 
-	starpu_node_kind kind = _starpu_get_node_kind(node);
-	switch(kind) {
+	enum starpu_node_kind kind = starpu_node_get_kind(node);
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
 			free((void*)matrix_interface->ptr);
 			break;
 #ifdef STARPU_USE_CUDA
 		case STARPU_CUDA_RAM:
-			status = cudaFree((void*)matrix_interface->ptr);			
+			status = cudaFree((void*)matrix_interface->ptr);
 			if (STARPU_UNLIKELY(status))
 				STARPU_CUDA_REPORT_ERROR(status);
 
@@ -356,36 +375,37 @@ static void free_matrix_buffer_on_node(void *data_interface, uint32_t node)
 #endif
 #ifdef STARPU_USE_OPENCL
                 case STARPU_OPENCL_RAM:
-                        clReleaseMemObject((void *)matrix_interface->ptr);
+			clReleaseMemObject((void *)matrix_interface->dev_handle);
                         break;
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 }
 
 #ifdef STARPU_USE_CUDA
 static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind, int is_async, cudaStream_t stream)
 {
-	starpu_matrix_interface_t *src_matrix = src_interface;
-	starpu_matrix_interface_t *dst_matrix = dst_interface;
+	struct starpu_matrix_interface *src_matrix = src_interface;
+	struct starpu_matrix_interface *dst_matrix = dst_interface;
 
 	size_t elemsize = src_matrix->elemsize;
 	cudaError_t cures;
 
 #if 0
-
 	struct cudaMemcpy3DParms p;
 	memset(&p, 0, sizeof(p));
 
-	p.srcPtr = make_cudaPitchedPtr((char *)src_matrix->ptr, src_matrix->ld * elemsize, src_matrix->ld * src_matrix->ny *elemsize, src_matrix->ny);
-	p.dstPtr = make_cudaPitchedPtr((char *)dst_matrix->ptr, dst_matrix->ld * elemsize, dst_matrix->ld * src_matrix->ny *elemsize, dst_matrix->ny);
-	p.extent = make_cudaExtent(src_matrix->nx, src_matrix->ny, 1);
+	p.srcPtr = make_cudaPitchedPtr((char *)src_matrix->ptr, src_matrix->ld * elemsize, src_matrix->ld * elemsize, src_matrix->ny);
+	p.dstPtr = make_cudaPitchedPtr((char *)dst_matrix->ptr, dst_matrix->ld * elemsize, dst_matrix->ld * elemsize, dst_matrix->ny);
+	p.extent = make_cudaExtent(src_matrix->nx * elemsize, src_matrix->ny, 1);
 	p.kind = kind;
 
 	if (is_async)
 	{
+		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpy3DAsync(&p, stream);
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		if (!cures)
 			return -EAGAIN;
 	}
@@ -393,13 +413,15 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 	cures = cudaMemcpy3D(&p);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
-#endif
+#else
 
 	if (is_async)
 	{
+		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
 			(char *)src_matrix->ptr, src_matrix->ld*elemsize,
 			src_matrix->nx*elemsize, src_matrix->ny, kind, stream);
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		if (!cures)
 			return -EAGAIN;
 	}
@@ -409,26 +431,29 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 		src_matrix->nx*elemsize, src_matrix->ny, kind);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
+#endif
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
 
 	return 0;
 }
 
-/* XXX this is broken : we need to find a way to fix that ! */
-#if 0
+/* XXX this is broken: we need to properly call cudaDeviceEnablePeerAccess(), and avoid crossing NUMA nodes... */
+#ifdef NO_STRIDE
 static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, int is_async, cudaStream_t stream)
 {
-	starpu_matrix_interface_t *src_matrix = src_interface;
-	starpu_matrix_interface_t *dst_matrix = dst_interface;
+	struct starpu_matrix_interface *src_matrix = src_interface;
+	struct starpu_matrix_interface *dst_matrix = dst_interface;
 
 	size_t elemsize = src_matrix->elemsize;
 	cudaError_t cures;
 
-#if 1
-	int src_dev = starpu_memory_node_to_devid(src_node);
-	int dst_dev = starpu_memory_node_to_devid(dst_node);
+	int src_dev = _starpu_memory_node_to_devid(src_node);
+	int dst_dev = _starpu_memory_node_to_devid(dst_node);
+
 
+#if 0
+	/* That code is not even working!! */
 	struct cudaExtent extent = make_cudaExtent(128, 128, 128);
 
 	cures = cudaSetDevice(src_dev);
@@ -455,21 +480,55 @@ static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUT
 	p.dstPtr = mem_device2;
 	p.extent = extent;
 
+	fprintf(stderr,"%u %u\n", p.srcDevice, p.dstDevice);
+	fprintf(stderr,"%p %p\n", p.srcArray, p.dstArray);
+	fprintf(stderr,"%p %lu %lu %lu\n", p.srcPtr.ptr, p.srcPtr.pitch, p.srcPtr.xsize, p.srcPtr.ysize);
+	fprintf(stderr,"%p %lu %lu %lu\n", p.dstPtr.ptr, p.dstPtr.pitch, p.dstPtr.xsize, p.dstPtr.ysize);
+	fprintf(stderr,"%lu %lu %lu\n", p.srcPos.x, p.srcPos.y, p.srcPos.z);
+	fprintf(stderr,"%lu %lu %lu\n", p.dstPos.x, p.dstPos.y, p.dstPos.z);
+	fprintf(stderr,"%lu %lu %lu\n", p.extent.width, p.extent.height, p.extent.depth);
 	cures = cudaMemcpy3DPeer(&p);
 	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
+	        STARPU_CUDA_REPORT_ERROR(cures);
+#endif
+
+#if 0
+	struct cudaMemcpy3DPeerParms p;
+	memset(&p, 0, sizeof(p));
+
+	p.srcDevice = src_dev;
+	p.dstDevice = dst_dev;
+	p.srcPtr = make_cudaPitchedPtr((char *)src_matrix->ptr, src_matrix->ld * elemsize, src_matrix->nx * elemsize, src_matrix->ny);
+	p.dstPtr = make_cudaPitchedPtr((char *)dst_matrix->ptr, dst_matrix->ld * elemsize, dst_matrix->nx * elemsize, dst_matrix->ny);
+	p.extent = make_cudaExtent(src_matrix->nx * elemsize, src_matrix->ny, 1);
+
+#if 1
+	fprintf(stderr,"%u %u\n", p.srcDevice, p.dstDevice);
+	fprintf(stderr,"%p %p\n", p.srcArray, p.dstArray);
+	fprintf(stderr,"%p %lu %lu %lu\n", p.srcPtr.ptr, p.srcPtr.pitch, p.srcPtr.xsize, p.srcPtr.ysize);
+	fprintf(stderr,"%p %lu %lu %lu\n", p.dstPtr.ptr, p.dstPtr.pitch, p.dstPtr.xsize, p.dstPtr.ysize);
+	fprintf(stderr,"%lu %lu %lu\n", p.srcPos.x, p.srcPos.y, p.srcPos.z);
+	fprintf(stderr,"%lu %lu %lu\n", p.dstPos.x, p.dstPos.y, p.dstPos.z);
+	fprintf(stderr,"%lu %lu %lu\n", p.extent.width, p.extent.height, p.extent.depth);
+#endif
 
+	cures = cudaMemcpy3DPeerAsync(&p, stream);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+	cudaThreadSynchronize();
 
-//make_cudaPitchedPtr((char *)src_matrix->ptr, src_matrix->ld * elemsize, src_matrix->nx, src_matrix->ny);
-//make_cudaPitchedPtr((char *)dst_matrix->ptr, dst_matrix->ld * elemsize, src_matrix->nx, dst_matrix->ny);
-//make_cudaExtent(src_matrix->nx, src_matrix->ny, 1);
+	if (is_async)
+	{
+		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+		cures = cudaMemcpy3DPeerAsync(&p, stream);
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+		if (!cures)
+			return -EAGAIN;
+	}
 
-//	if (is_async)
-//	{
-//		cures = cudaMemcpy3DPeerAsync(&p, stream);
-//		if (!cures)
-//			return -EAGAIN;
-//	}
+	cures = cudaMemcpy3DPeer(&p);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
 
 #else
 	/* XXX FIXME !!*/
@@ -478,7 +537,9 @@ static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUT
 
 	if (is_async)
 	{
+		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpyPeerAsync((char *)dst_matrix->ptr, dst_dev, (char *)src_matrix->ptr, src_dev, dst_matrix->nx*dst_matrix->ny*elemsize, stream);
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		if (!cures)
 			return -EAGAIN;
 	}
@@ -488,7 +549,7 @@ static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUT
 		STARPU_CUDA_REPORT_ERROR(cures);
 #endif
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
 
 	return 0;
 }
@@ -526,7 +587,7 @@ static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_
 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, 1, stream);
 }
 
-#if 0
+#ifdef NO_STRIDE
 static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
 {
 	if (src_node == dst_node)
@@ -540,41 +601,41 @@ static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node STARPU
 #ifdef STARPU_USE_OPENCL
 static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
 {
-	starpu_matrix_interface_t *src_matrix = src_interface;
-	starpu_matrix_interface_t *dst_matrix = dst_interface;
+	struct starpu_matrix_interface *src_matrix = src_interface;
+	struct starpu_matrix_interface *dst_matrix = dst_interface;
         int err,ret;
 
 	/* XXX non-contiguous matrices are not supported with OpenCL yet! (TODO) */
 	STARPU_ASSERT((src_matrix->ld == src_matrix->nx) && (dst_matrix->ld == dst_matrix->nx));
 
-	err = _starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_matrix->ptr, (cl_mem)dst_matrix->dev_handle,
+	err = starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_matrix->ptr, src_node, (cl_mem)dst_matrix->dev_handle, dst_node,
                                                            src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
                                                            dst_matrix->offset, (cl_event*)_event, &ret);
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
 
 	return ret;
 }
 
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
 {
-	starpu_matrix_interface_t *src_matrix = src_interface;
-	starpu_matrix_interface_t *dst_matrix = dst_interface;
+	struct starpu_matrix_interface *src_matrix = src_interface;
+	struct starpu_matrix_interface *dst_matrix = dst_interface;
         int err, ret;
 
 	/* XXX non-contiguous matrices are not supported with OpenCL yet! (TODO) */
 	STARPU_ASSERT((src_matrix->ld == src_matrix->nx) && (dst_matrix->ld == dst_matrix->nx));
 
-        err = _starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_matrix->dev_handle, (void*)dst_matrix->ptr,
+        err = starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_matrix->dev_handle, src_node, (void*)dst_matrix->ptr, dst_node,
                                                            src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
                                                            src_matrix->offset, (cl_event*)_event, &ret);
 
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
 
 	return ret;
 }
@@ -594,8 +655,8 @@ static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTR
 /* as not all platforms easily have a  lib installed ... */
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
-	starpu_matrix_interface_t *src_matrix = (starpu_matrix_interface_t *) src_interface;
-	starpu_matrix_interface_t *dst_matrix = (starpu_matrix_interface_t *) dst_interface;
+	struct starpu_matrix_interface *src_matrix = (struct starpu_matrix_interface *) src_interface;
+	struct starpu_matrix_interface *dst_matrix = (struct starpu_matrix_interface *) dst_interface;
 
 	unsigned y;
 	uint32_t nx = dst_matrix->nx;
@@ -614,11 +675,11 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBU
 		uint32_t src_offset = y*ld_src*elemsize;
 		uint32_t dst_offset = y*ld_dst*elemsize;
 
-		memcpy((void *)(ptr_dst + dst_offset), 
+		memcpy((void *)(ptr_dst + dst_offset),
 			(void *)(ptr_src + src_offset), nx*elemsize);
 	}
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)nx*ny*elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)nx*ny*elemsize);
 
 	return 0;
 }

+ 724 - 0
src/datawizard/interfaces/multiformat_interface.c

@@ -0,0 +1,724 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011-2012  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+#include <common/config.h>
+#include <datawizard/coherency.h>
+#include <datawizard/copy_driver.h>
+#include <datawizard/filters.h>
+#include <starpu_hash.h>
+#include <starpu_cuda.h>
+#include <starpu_opencl.h>
+#include <drivers/opencl/driver_opencl.h>
+#include <core/task.h>
+
+static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
+#ifdef STARPU_USE_CUDA
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cudaStream_t stream);
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cudaStream_t stream);
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
+#endif
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
+static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
+static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, void *_event);
+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, void *_event);
+#endif
+
+static const struct starpu_data_copy_methods multiformat_copy_data_methods_s =
+{
+	.ram_to_ram = copy_ram_to_ram,
+	.ram_to_spu = NULL,
+#ifdef STARPU_USE_CUDA
+	.ram_to_cuda = copy_ram_to_cuda,
+	.cuda_to_ram = copy_cuda_to_ram,
+	.ram_to_cuda_async = copy_ram_to_cuda_async,
+	.cuda_to_ram_async = copy_cuda_to_ram_async,
+	.cuda_to_cuda = copy_cuda_to_cuda,
+	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
+#endif
+#ifdef STARPU_USE_OPENCL
+	.ram_to_opencl = copy_ram_to_opencl,
+	.opencl_to_ram = copy_opencl_to_ram,
+	.opencl_to_opencl = copy_opencl_to_opencl,
+        .ram_to_opencl_async = copy_ram_to_opencl_async,
+	.opencl_to_ram_async = copy_opencl_to_ram_async,
+#endif
+	.cuda_to_spu = NULL,
+	.spu_to_ram = NULL,
+	.spu_to_cuda = NULL,
+	.spu_to_spu = NULL
+};
+
+static void register_multiformat_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface);
+static ssize_t allocate_multiformat_buffer_on_node(void *data_interface_, uint32_t dst_node);
+static void *multiformat_handle_to_pointer(starpu_data_handle_t data_handle, uint32_t node);
+static void free_multiformat_buffer_on_node(void *data_interface, uint32_t node);
+static size_t multiformat_interface_get_size(starpu_data_handle_t handle);
+static uint32_t footprint_multiformat_interface_crc32(starpu_data_handle_t handle);
+static int multiformat_compare(void *data_interface_a, void *data_interface_b);
+static void display_multiformat_interface(starpu_data_handle_t handle, FILE *f);
+static uint32_t starpu_multiformat_get_nx(starpu_data_handle_t handle);
+#ifdef STARPU_USE_GORDON
+static int convert_multiformat_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss);
+#endif
+
+static struct starpu_multiformat_data_interface_ops*
+get_mf_ops(void *data_interface)
+{
+	struct starpu_multiformat_interface *mf;
+	mf = (struct starpu_multiformat_interface *) data_interface;
+
+	return mf->ops;
+}
+
+static struct starpu_data_interface_ops interface_multiformat_ops =
+{
+	.register_data_handle  = register_multiformat_handle,
+	.allocate_data_on_node = allocate_multiformat_buffer_on_node,
+	.handle_to_pointer     = multiformat_handle_to_pointer,
+	.free_data_on_node     = free_multiformat_buffer_on_node,
+	.copy_methods          = &multiformat_copy_data_methods_s,
+	.get_size              = multiformat_interface_get_size,
+	.footprint             = footprint_multiformat_interface_crc32,
+	.compare               = multiformat_compare,
+#ifdef STARPU_USE_GORDON
+	.convert_to_gordon     = NULL,
+#endif
+	.interfaceid           = STARPU_MULTIFORMAT_INTERFACE_ID,
+	.interface_size        = sizeof(struct starpu_multiformat_interface),
+	.display               = display_multiformat_interface,
+	.is_multiformat        = 1,
+	.get_mf_ops            = get_mf_ops
+};
+
+static void *multiformat_handle_to_pointer(starpu_data_handle_t handle, uint32_t node)
+{
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+	struct starpu_multiformat_interface *multiformat_interface =
+		(struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, node);
+
+	switch(starpu_node_get_kind(node))
+	{
+		case STARPU_CPU_RAM:
+			return multiformat_interface->cpu_ptr;
+#ifdef STARPU_USE_CUDA
+		case STARPU_CUDA_RAM:
+			return multiformat_interface->cuda_ptr;
+#endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_RAM:
+			return multiformat_interface->opencl_ptr;
+#endif
+		default:
+			STARPU_ASSERT(0);
+	}
+}
+
+static void register_multiformat_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface)
+{
+	struct starpu_multiformat_interface *multiformat_interface;
+	multiformat_interface = (struct starpu_multiformat_interface *) data_interface;
+
+	unsigned node;
+	for (node = 0; node < STARPU_MAXNODES; node++)
+	{
+		struct starpu_multiformat_interface *local_interface =
+			(struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, node);
+
+		if (node == home_node)
+		{
+			local_interface->cpu_ptr    = multiformat_interface->cpu_ptr;
+#ifdef STARPU_USE_CUDA
+			local_interface->cuda_ptr   = multiformat_interface->cuda_ptr;
+#endif
+#ifdef STARPU_USE_OPENCL
+			local_interface->opencl_ptr = multiformat_interface->opencl_ptr;
+#endif
+		}
+		else
+		{
+			local_interface->cpu_ptr    = NULL;
+#ifdef STARPU_USE_CUDA
+			local_interface->cuda_ptr   = NULL;
+#endif
+#ifdef STARPU_USE_OPENCL
+			local_interface->opencl_ptr = NULL;
+#endif
+		}
+		local_interface->nx = multiformat_interface->nx;
+		local_interface->ops = multiformat_interface->ops;
+	}
+}
+
+void starpu_multiformat_data_register(starpu_data_handle_t *handleptr,
+				      uint32_t home_node,
+				      void *ptr,
+				      uint32_t nobjects,
+				      struct starpu_multiformat_data_interface_ops *format_ops)
+{
+#ifdef STARPU_USE_OPENCL
+	_starpu_codelet_check_deprecated_fields(format_ops->cpu_to_opencl_cl);
+	_starpu_codelet_check_deprecated_fields(format_ops->opencl_to_cpu_cl);
+#endif
+#ifdef STARPU_USE_CUDA
+	_starpu_codelet_check_deprecated_fields(format_ops->cpu_to_cuda_cl);
+	_starpu_codelet_check_deprecated_fields(format_ops->cuda_to_cpu_cl);
+#endif
+
+	struct starpu_multiformat_interface multiformat =
+	{
+		.cpu_ptr    = ptr,
+#ifdef STARPU_USE_CUDA
+		.cuda_ptr   = NULL,
+#endif
+#ifdef STARPU_USE_OPENCL
+		.opencl_ptr = NULL,
+#endif
+		.nx         = nobjects,
+		.ops        = format_ops
+	};
+
+	starpu_data_register(handleptr, home_node, &multiformat, &interface_multiformat_ops);
+}
+
+static uint32_t footprint_multiformat_interface_crc32(starpu_data_handle_t handle)
+{
+	return starpu_crc32_be(starpu_multiformat_get_nx(handle), 0);
+}
+
+static int multiformat_compare(void *data_interface_a, void *data_interface_b)
+{
+	struct starpu_multiformat_interface *multiformat_a = (struct starpu_multiformat_interface *) data_interface_a;
+	struct starpu_multiformat_interface *multiformat_b = (struct starpu_multiformat_interface *) data_interface_b;
+
+	return ((multiformat_a->nx == multiformat_b->nx)
+			&& (multiformat_a->ops->cpu_elemsize == multiformat_b->ops->cpu_elemsize)
+#ifdef STARPU_USE_CUDA
+			&& (multiformat_a->ops->cuda_elemsize == multiformat_b->ops->cuda_elemsize)
+#endif
+#ifdef STARPU_USE_OPENCL
+			&& (multiformat_a->ops->opencl_elemsize == multiformat_b->ops->opencl_elemsize)
+#endif
+		);
+}
+
+static void display_multiformat_interface(starpu_data_handle_t handle, FILE *f)
+{
+	/* TODO */
+	struct starpu_multiformat_interface *multiformat_interface;
+	multiformat_interface = (struct starpu_multiformat_interface *)
+		starpu_data_get_interface_on_node(handle, 0);
+
+	fprintf(f, "%u\t", multiformat_interface->nx);
+}
+
+/* XXX : returns CPU size */
+static size_t multiformat_interface_get_size(starpu_data_handle_t handle)
+{
+	size_t size;
+	struct starpu_multiformat_interface *multiformat_interface;
+	multiformat_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, 0);
+	size = multiformat_interface->nx * multiformat_interface->ops->cpu_elemsize;
+	return size;
+}
+
+uint32_t starpu_multiformat_get_nx(starpu_data_handle_t handle)
+{
+	struct starpu_multiformat_interface *multiformat_interface;
+	multiformat_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, 0);
+	return multiformat_interface->nx;
+}
+
+static void free_multiformat_buffer_on_node(void *data_interface, uint32_t node)
+{
+	struct starpu_multiformat_interface *multiformat_interface;
+	multiformat_interface = (struct starpu_multiformat_interface *) data_interface;
+	enum starpu_node_kind kind = starpu_node_get_kind(node);
+
+	switch(kind)
+	{
+		case STARPU_CPU_RAM:
+			free(multiformat_interface->cpu_ptr);
+			multiformat_interface->cpu_ptr = NULL;
+			break;
+#ifdef STARPU_USE_CUDA
+		case STARPU_CUDA_RAM:
+			if (multiformat_interface->cpu_ptr)
+			{
+				cudaFree(multiformat_interface->cpu_ptr);
+				multiformat_interface->cpu_ptr = NULL;
+			}
+			if (multiformat_interface->cuda_ptr)
+			{
+				cudaFree(multiformat_interface->cuda_ptr);
+				multiformat_interface->cuda_ptr = NULL;
+			}
+			break;
+#endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_RAM:
+			/* TODO */
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+	}
+}
+
+static ssize_t allocate_multiformat_buffer_on_node(void *data_interface_, uint32_t dst_node)
+{
+	struct starpu_multiformat_interface *multiformat_interface;
+	multiformat_interface = (struct starpu_multiformat_interface *) data_interface_;
+	unsigned fail = 0;
+	uintptr_t addr = 0;
+	ssize_t allocated_memory;
+
+	enum starpu_node_kind kind = starpu_node_get_kind(dst_node);
+	switch(kind)
+	{
+		case STARPU_CPU_RAM:
+			allocated_memory = multiformat_interface->nx * multiformat_interface->ops->cpu_elemsize;
+			addr = (uintptr_t)malloc(allocated_memory);
+			if (!addr)
+			{
+				fail = 1;
+			}
+			else
+			{
+				multiformat_interface->cpu_ptr = (void *) addr;
+			}
+
+#ifdef STARPU_USE_CUDA
+			multiformat_interface->cuda_ptr = malloc(multiformat_interface->nx * multiformat_interface->ops->cuda_elemsize);
+			STARPU_ASSERT(multiformat_interface->cuda_ptr != NULL);
+#endif
+#ifdef STARPU_USE_OPENCL
+			multiformat_interface->opencl_ptr = malloc(multiformat_interface->nx * multiformat_interface->ops->opencl_elemsize);
+			STARPU_ASSERT(multiformat_interface->opencl_ptr != NULL);
+#endif
+			break;
+#ifdef STARPU_USE_CUDA
+		case STARPU_CUDA_RAM:
+			{
+				allocated_memory = multiformat_interface->nx * multiformat_interface->ops->cuda_elemsize;
+				cudaError_t status = cudaMalloc((void **)&addr, allocated_memory);
+				if (STARPU_UNLIKELY(status))
+				{
+					STARPU_CUDA_REPORT_ERROR(status);
+				}
+				else
+				{
+					multiformat_interface->cuda_ptr = (void *)addr;
+				}
+
+				allocated_memory = multiformat_interface->nx * multiformat_interface->ops->cpu_elemsize;
+				status = cudaMalloc((void **)&multiformat_interface->cpu_ptr, allocated_memory);
+				if (STARPU_UNLIKELY(status != cudaSuccess))
+					STARPU_CUDA_REPORT_ERROR(status);
+				break;
+			}
+#endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_RAM:
+			{
+                                int ret;
+				cl_mem ptr;
+				allocated_memory = multiformat_interface->nx * multiformat_interface->ops->opencl_elemsize;
+                                ret = starpu_opencl_allocate_memory(&ptr, allocated_memory, CL_MEM_READ_WRITE);
+                                addr = (uintptr_t)ptr;
+				if (ret)
+				{
+					fail = 1;
+				}
+				else
+				{
+					multiformat_interface->opencl_ptr = (void *)addr;
+
+				}
+
+				ret = starpu_opencl_allocate_memory(&ptr,
+							multiformat_interface->nx * multiformat_interface->ops->cpu_elemsize,
+							CL_MEM_READ_WRITE);
+				addr = (uintptr_t)ptr;
+				if (ret)
+				{
+					fail = 1;
+				}
+				else
+				{
+					multiformat_interface->cpu_ptr = (void *) addr;
+				}
+				
+				break;
+			}
+#endif
+		default:
+			STARPU_ASSERT(0);
+	}
+
+	if (fail)
+		return -ENOMEM;
+
+	return allocated_memory;
+}
+
+
+
+
+/*
+ * Copy methods
+ */
+static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__ ((unused)),
+			   void *dst_interface, unsigned dst_node __attribute__ ((unused)))
+{
+	struct starpu_multiformat_interface *src_multiformat;
+	struct starpu_multiformat_interface *dst_multiformat;
+
+	src_multiformat = (struct starpu_multiformat_interface *) src_interface;
+	dst_multiformat = (struct starpu_multiformat_interface *) dst_interface;
+
+	STARPU_ASSERT(src_multiformat != NULL);
+	STARPU_ASSERT(dst_multiformat != NULL);
+	STARPU_ASSERT(dst_multiformat->ops != NULL);
+
+	size_t size = dst_multiformat->nx * dst_multiformat->ops->cpu_elemsize;
+	memcpy(dst_multiformat->cpu_ptr, src_multiformat->cpu_ptr, size);
+
+	return 0;
+}
+
+#ifdef STARPU_USE_CUDA
+static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__ ((unused)),
+			    void *dst_interface, unsigned dst_node __attribute__ ((unused)),
+			    enum cudaMemcpyKind kind)
+{
+	struct starpu_multiformat_interface *src_multiformat;
+	struct starpu_multiformat_interface *dst_multiformat;
+
+	src_multiformat = (struct starpu_multiformat_interface *) src_interface;
+	dst_multiformat = (struct starpu_multiformat_interface *) dst_interface;
+
+	size_t size;
+
+	cudaError_t status;
+
+	switch (kind)
+	{
+		case cudaMemcpyHostToDevice:
+		{
+			size = src_multiformat->nx * src_multiformat->ops->cuda_elemsize;
+			if (src_multiformat->cuda_ptr == NULL)
+			{
+				src_multiformat->cuda_ptr = malloc(size);
+				if (src_multiformat->cuda_ptr == NULL)
+					return -ENOMEM;
+			}
+			status = cudaMemcpy(dst_multiformat->cpu_ptr, src_multiformat->cpu_ptr, size, kind);
+			if (STARPU_UNLIKELY(status))
+			{
+				STARPU_CUDA_REPORT_ERROR(status);
+			}
+			break;
+		}
+		case cudaMemcpyDeviceToHost:
+		{
+			size = src_multiformat->nx * src_multiformat->ops->cuda_elemsize;
+			status = cudaMemcpy(dst_multiformat->cuda_ptr, src_multiformat->cuda_ptr, size, kind);
+			if (STARPU_UNLIKELY(status))
+				STARPU_CUDA_REPORT_ERROR(status);
+
+			break;
+		}
+		case cudaMemcpyDeviceToDevice:
+		{
+			size = src_multiformat->nx * src_multiformat->ops->cuda_elemsize;
+			status = cudaMemcpy(dst_multiformat->cuda_ptr, src_multiformat->cuda_ptr, size, kind);
+			if (STARPU_UNLIKELY(status))
+				STARPU_CUDA_REPORT_ERROR(status);
+			break;
+		}
+		default:
+			STARPU_ASSERT(0);
+	}
+
+	return 0;
+}
+
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node)
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
+}
+
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node)
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
+}
+
+static int copy_cuda_common_async(void *src_interface, unsigned src_node __attribute__ ((unused)),
+				  void *dst_interface, unsigned dst_node __attribute__ ((unused)),
+				  cudaStream_t stream, enum cudaMemcpyKind kind)
+{
+	struct starpu_multiformat_interface *src_multiformat;
+	struct starpu_multiformat_interface *dst_multiformat;
+
+	src_multiformat = (struct starpu_multiformat_interface *) src_interface;
+	dst_multiformat = (struct starpu_multiformat_interface *) dst_interface;
+
+	size_t size;
+	cudaError_t status;
+
+	switch (kind)
+	{
+		case cudaMemcpyHostToDevice:
+		{
+			size = src_multiformat->nx * src_multiformat->ops->cuda_elemsize;
+			if (src_multiformat->cuda_ptr == NULL)
+			{
+				src_multiformat->cuda_ptr = malloc(size);
+				if (src_multiformat->cuda_ptr == NULL)
+					return -ENOMEM;
+			}
+
+			status = cudaMemcpyAsync(dst_multiformat->cpu_ptr, src_multiformat->cpu_ptr, size, kind, stream);
+			if (STARPU_UNLIKELY(status))
+			{
+				STARPU_CUDA_REPORT_ERROR(status);
+			}
+			break;
+		}
+		case cudaMemcpyDeviceToHost:
+		{
+			size = src_multiformat->nx * src_multiformat->ops->cuda_elemsize;
+			status = cudaMemcpy(dst_multiformat->cuda_ptr, src_multiformat->cuda_ptr, size, kind);
+			if (STARPU_UNLIKELY(status))
+				STARPU_CUDA_REPORT_ERROR(status);
+
+			break;
+		}
+		case cudaMemcpyDeviceToDevice:
+		{
+			size = src_multiformat->nx * src_multiformat->ops->cuda_elemsize;
+			status = cudaMemcpyAsync(dst_multiformat->cuda_ptr, src_multiformat->cuda_ptr, size, kind, stream);
+			if (STARPU_UNLIKELY(status))
+				STARPU_CUDA_REPORT_ERROR(status);
+			break;
+		}
+		default:
+			STARPU_ASSERT(0);
+	}
+
+	return 0;
+}
+
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cudaStream_t stream)
+{
+	return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
+}
+
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cudaStream_t stream)
+{
+	return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
+}
+
+#ifdef HAVE_CUDA_MEMCPY_PEER
+static int copy_cuda_peer_common(void *src_interface, unsigned src_node,
+				void *dst_interface, unsigned dst_node,
+				cudaStream_t stream)
+{
+	struct starpu_multiformat_interface *src_multiformat;
+	struct starpu_multiformat_interface *dst_multiformat;
+
+	src_multiformat = (struct starpu_multiformat_interface *) src_interface;
+	dst_multiformat = (struct starpu_multiformat_interface *) dst_interface;
+
+	STARPU_ASSERT(src_multiformat != NULL);
+	STARPU_ASSERT(dst_multiformat != NULL);
+	STARPU_ASSERT(src_multiformat->ops != NULL);
+
+	cudaError_t status;
+	int size = src_multiformat->nx * src_multiformat->ops->cuda_elemsize;
+	int src_dev = _starpu_memory_node_to_devid(src_node);
+	int dst_dev = _starpu_memory_node_to_devid(dst_node);
+
+	if (stream)
+	{
+		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+		status = cudaMemcpyPeerAsync(dst_multiformat->cuda_ptr, dst_dev,
+					     src_multiformat->cuda_ptr, src_dev,
+					     size, stream);
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+		/* All good! Still, returning -EAGAIN, because we will need to
+                   check the transfer completion later */
+		if (status == cudaSuccess)
+			return -EAGAIN;
+	}
+
+	/* Either a synchronous transfer was requested, or the asynchronous one
+           failed. */
+	status = cudaMemcpyPeer(dst_multiformat->cuda_ptr, dst_dev,
+				src_multiformat->cuda_ptr, src_dev,
+				size);
+	if (STARPU_UNLIKELY(status != cudaSuccess))
+		STARPU_CUDA_REPORT_ERROR(status);
+
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, size);
+
+	return 0;
+}
+#endif
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
+{
+	if (src_node == dst_node)
+	{
+		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
+	}
+	else
+	{
+#ifdef HAVE_CUDA_MEMCPY_PEER
+		return copy_cuda_peer_common(src_interface, src_node,
+					     dst_interface, dst_node,
+					     NULL);
+#else
+		STARPU_ASSERT(0);
+#endif
+	}
+}
+
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,
+				   void *dst_interface, unsigned dst_node,
+				   cudaStream_t stream)
+{
+	if (src_node == dst_node)
+	{
+		return copy_cuda_common_async(src_interface, src_node,
+					      dst_interface, dst_node,
+					      stream, cudaMemcpyDeviceToDevice);
+	}
+	else
+	{
+#ifdef HAVE_CUDA_MEMCPY_PEER
+		return copy_cuda_peer_common(src_interface, src_node,
+					     dst_interface, dst_node,
+					     stream);
+#else
+		STARPU_ASSERT(0);
+#endif
+	}
+}
+#endif /* STARPU_USE_CUDA */
+
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
+				    void *dst_interface, unsigned dst_node,
+				    void *_event)
+{
+	int err, ret;
+	size_t size;
+	struct starpu_multiformat_interface *src_multiformat;
+	struct starpu_multiformat_interface *dst_multiformat;
+
+	src_multiformat = (struct starpu_multiformat_interface *) src_interface;
+	dst_multiformat = (struct starpu_multiformat_interface *) dst_interface;
+
+	STARPU_ASSERT(src_multiformat != NULL);
+	STARPU_ASSERT(dst_multiformat != NULL);
+	STARPU_ASSERT(src_multiformat->ops != NULL);
+
+	size = src_multiformat->nx * src_multiformat->ops->opencl_elemsize;
+
+
+	err = starpu_opencl_copy_ram_to_opencl_async_sync(src_multiformat->cpu_ptr,
+							   src_node,
+							   (cl_mem) dst_multiformat->cpu_ptr,
+							   dst_node,
+							   size,
+							   0,
+							   (cl_event *) _event,
+							   &ret);
+        if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, size);
+	return ret;
+}
+
+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
+				    void *dst_interface, unsigned dst_node,
+				    void *_event)
+{
+	int err, ret;
+	size_t size;
+	struct starpu_multiformat_interface *src_multiformat;
+	struct starpu_multiformat_interface *dst_multiformat;
+
+	src_multiformat = (struct starpu_multiformat_interface *) src_interface;
+	dst_multiformat = (struct starpu_multiformat_interface *) dst_interface;
+
+	STARPU_ASSERT(src_multiformat != NULL);
+	STARPU_ASSERT(dst_multiformat != NULL);
+	STARPU_ASSERT(src_multiformat->ops != NULL);
+	STARPU_ASSERT(dst_multiformat->ops != NULL);
+
+	size = src_multiformat->nx * src_multiformat->ops->opencl_elemsize;
+
+	if (dst_multiformat->opencl_ptr == NULL) {
+		/* XXX : it is weird that we might have to allocate memory here... */
+		dst_multiformat->opencl_ptr = malloc(dst_multiformat->nx * dst_multiformat->ops->opencl_elemsize);
+	}
+	err = starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_multiformat->opencl_ptr,
+							   src_node,
+							   dst_multiformat->opencl_ptr,
+							   dst_node,
+							   size,
+                                                           0,
+							   (cl_event *)_event,
+							   &ret);
+        if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, size);
+
+
+	return ret;
+}
+
+static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+                              void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
+{
+        return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
+}
+
+static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
+{
+        return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
+}
+
+static int copy_opencl_to_opencl(void *src_interface, unsigned src_node,
+                                 void *dst_interface, unsigned dst_node)
+{
+	(void) src_interface;
+	(void) dst_interface;
+	(void) src_node;
+	(void) dst_node;
+/* TODO */
+	return 0;
+}
+#endif
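
The new file above also exports starpu_multiformat_data_register(), the entry point applications use to declare such data. Below is a minimal usage sketch (not part of the commit), assuming a CPU-only build: the point structure, NX and the task-submission step are illustrative assumptions, while the registration call, the cpu_elemsize field and the handle type are the ones introduced by this file.

#include <starpu.h>

struct point { float x, y; };	/* hypothetical CPU-side layout */
#define NX 1024
static struct point points[NX];

/* Only the CPU element size is filled in here; cuda_elemsize/opencl_elemsize
 * and the conversion codelets would be added when building with CUDA/OpenCL. */
static struct starpu_multiformat_data_interface_ops point_format_ops =
{
	.cpu_elemsize = sizeof(struct point),
};

int main(void)
{
	starpu_data_handle_t handle;
	starpu_init(NULL);
	/* Node 0 is main memory; register NX objects laid out as struct point. */
	starpu_multiformat_data_register(&handle, 0, points, NX, &point_format_ops);
	/* ... submit tasks that access the handle ... */
	starpu_data_unregister(handle);
	starpu_shutdown();
	return 0;
}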

+ 101 - 78
src/datawizard/interfaces/variable_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,7 +20,7 @@
 #include <datawizard/coherency.h>
 #include <datawizard/copy_driver.h>
 #include <datawizard/filters.h>
-#include <common/hash.h>
+#include <starpu_hash.h>
 #include <starpu_cuda.h>
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
@@ -42,7 +42,8 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event);
 #endif
 
-static const struct starpu_data_copy_methods variable_copy_data_methods_s = {
+static const struct starpu_data_copy_methods variable_copy_data_methods_s =
+{
 	.ram_to_ram = copy_ram_to_ram,
 	.ram_to_spu = NULL,
 #ifdef STARPU_USE_CUDA
@@ -66,19 +67,20 @@ static const struct starpu_data_copy_methods variable_copy_data_methods_s = {
 	.spu_to_spu = NULL
 };
 
-static void register_variable_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
+static void register_variable_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface);
 static ssize_t allocate_variable_buffer_on_node(void *data_interface_, uint32_t dst_node);
-static void *variable_handle_to_pointer(starpu_data_handle data_handle, uint32_t node);
+static void *variable_handle_to_pointer(starpu_data_handle_t data_handle, uint32_t node);
 static void free_variable_buffer_on_node(void *data_interface, uint32_t node);
-static size_t variable_interface_get_size(starpu_data_handle handle);
-static uint32_t footprint_variable_interface_crc32(starpu_data_handle handle);
+static size_t variable_interface_get_size(starpu_data_handle_t handle);
+static uint32_t footprint_variable_interface_crc32(starpu_data_handle_t handle);
 static int variable_compare(void *data_interface_a, void *data_interface_b);
-static void display_variable_interface(starpu_data_handle handle, FILE *f);
+static void display_variable_interface(starpu_data_handle_t handle, FILE *f);
 #ifdef STARPU_USE_GORDON
-static int convert_variable_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss); 
+static int convert_variable_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss);
 #endif
 
-static struct starpu_data_interface_ops_t interface_variable_ops = {
+static struct starpu_data_interface_ops interface_variable_ops =
+{
 	.register_data_handle = register_variable_handle,
 	.allocate_data_on_node = allocate_variable_buffer_on_node,
 	.handle_to_pointer = variable_handle_to_pointer,
@@ -91,29 +93,31 @@ static struct starpu_data_interface_ops_t interface_variable_ops = {
 	.convert_to_gordon = convert_variable_to_gordon,
 #endif
 	.interfaceid = STARPU_VARIABLE_INTERFACE_ID,
-	.interface_size = sizeof(starpu_variable_interface_t), 
-	.display = display_variable_interface
+	.interface_size = sizeof(struct starpu_variable_interface),
+	.display = display_variable_interface,
 };
 
-static void *variable_handle_to_pointer(starpu_data_handle handle, uint32_t node)
+static void *variable_handle_to_pointer(starpu_data_handle_t handle, uint32_t node)
 {
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
 	return (void*) STARPU_VARIABLE_GET_PTR(starpu_data_get_interface_on_node(handle, node));
 }
 
-static void register_variable_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface)
+static void register_variable_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface)
 {
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		starpu_variable_interface_t *local_interface = (starpu_variable_interface_t *)
+		struct starpu_variable_interface *local_interface = (struct starpu_variable_interface *)
 			starpu_data_get_interface_on_node(handle, node);
 
-		if (node == home_node) {
+		if (node == home_node)
+		{
 			local_interface->ptr = STARPU_VARIABLE_GET_PTR(data_interface);
 		}
-		else {
+		else
+		{
 			local_interface->ptr = 0;
 		}
 
@@ -122,7 +126,7 @@ static void register_variable_handle(starpu_data_handle handle, uint32_t home_no
 }
 
 #ifdef STARPU_USE_GORDON
-int convert_variable_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss) 
+int convert_variable_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss)
 {
 	*ptr = STARPU_VARIABLE_GET_PTR(interface);
 	(*ss).size = STARPU_VARIABLE_GET_ELEMSIZE(interface);
@@ -132,49 +136,50 @@ int convert_variable_to_gordon(void *data_interface, uint64_t *ptr, gordon_strid
 #endif
 
 /* declare a new data with the variable interface */
-void starpu_variable_data_register(starpu_data_handle *handleptr, uint32_t home_node,
+void starpu_variable_data_register(starpu_data_handle_t *handleptr, uint32_t home_node,
                         uintptr_t ptr, size_t elemsize)
 {
-	starpu_variable_interface_t variable = {
+	struct starpu_variable_interface variable =
+	{
 		.ptr = ptr,
 		.elemsize = elemsize
-	};	
+	};
 
-	starpu_data_register(handleptr, home_node, &variable, &interface_variable_ops); 
+	starpu_data_register(handleptr, home_node, &variable, &interface_variable_ops);
 }
 
 
-static uint32_t footprint_variable_interface_crc32(starpu_data_handle handle)
+static uint32_t footprint_variable_interface_crc32(starpu_data_handle_t handle)
 {
-	return _starpu_crc32_be(starpu_variable_get_elemsize(handle), 0);
+	return starpu_crc32_be(starpu_variable_get_elemsize(handle), 0);
 }
 
 static int variable_compare(void *data_interface_a, void *data_interface_b)
 {
-	starpu_variable_interface_t *variable_a = (starpu_variable_interface_t *) data_interface_a;
-	starpu_variable_interface_t *variable_b = (starpu_variable_interface_t *) data_interface_b;
+	struct starpu_variable_interface *variable_a = (struct starpu_variable_interface *) data_interface_a;
+	struct starpu_variable_interface *variable_b = (struct starpu_variable_interface *) data_interface_b;
 
 	/* Two variables are considered compatible if they have the same size */
 	return (variable_a->elemsize == variable_b->elemsize);
-} 
+}
 
-static void display_variable_interface(starpu_data_handle handle, FILE *f)
+static void display_variable_interface(starpu_data_handle_t handle, FILE *f)
 {
-	starpu_variable_interface_t *variable_interface = (starpu_variable_interface_t *)
+	struct starpu_variable_interface *variable_interface = (struct starpu_variable_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	fprintf(f, "%ld\t", (long)variable_interface->elemsize);
 }
 
-static size_t variable_interface_get_size(starpu_data_handle handle)
+static size_t variable_interface_get_size(starpu_data_handle_t handle)
 {
-	starpu_variable_interface_t *variable_interface = (starpu_variable_interface_t *)
+	struct starpu_variable_interface *variable_interface = (struct starpu_variable_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return variable_interface->elemsize;
 }
 
-uintptr_t starpu_variable_get_local_ptr(starpu_data_handle handle)
+uintptr_t starpu_variable_get_local_ptr(starpu_data_handle_t handle)
 {
 	unsigned node;
 	node = _starpu_get_local_memory_node();
@@ -184,7 +189,7 @@ uintptr_t starpu_variable_get_local_ptr(starpu_data_handle handle)
 	return STARPU_VARIABLE_GET_PTR(starpu_data_get_interface_on_node(handle, node));
 }
 
-size_t starpu_variable_get_elemsize(starpu_data_handle handle)
+size_t starpu_variable_get_elemsize(starpu_data_handle_t handle)
 {
 	return STARPU_VARIABLE_GET_ELEMSIZE(starpu_data_get_interface_on_node(handle, 0));
 }
@@ -194,7 +199,7 @@ size_t starpu_variable_get_elemsize(starpu_data_handle handle)
 /* returns the size of the allocated area */
 static ssize_t allocate_variable_buffer_on_node(void *data_interface_, uint32_t dst_node)
 {
-	starpu_variable_interface_t *variable_interface = (starpu_variable_interface_t *) data_interface_;
+	struct starpu_variable_interface *variable_interface = (struct starpu_variable_interface *) data_interface_;
 
 	unsigned fail = 0;
 	uintptr_t addr = 0;
@@ -202,13 +207,14 @@ static ssize_t allocate_variable_buffer_on_node(void *data_interface_, uint32_t
 
 	size_t elemsize = variable_interface->elemsize;
 
-	starpu_node_kind kind = _starpu_get_node_kind(dst_node);
+	enum starpu_node_kind kind = starpu_node_get_kind(dst_node);
 
 #ifdef STARPU_USE_CUDA
 	cudaError_t status;
 #endif
 
-	switch(kind) {
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
 			addr = (uintptr_t)malloc(elemsize);
 			if (!addr)
@@ -230,17 +236,18 @@ static ssize_t allocate_variable_buffer_on_node(void *data_interface_, uint32_t
 	        case STARPU_OPENCL_RAM:
 			{
                                 int ret;
-                                void *ptr;
-                                ret = _starpu_opencl_allocate_memory(&ptr, elemsize, CL_MEM_READ_WRITE);
+				cl_mem ptr;
+                                ret = starpu_opencl_allocate_memory(&ptr, elemsize, CL_MEM_READ_WRITE);
                                 addr = (uintptr_t)ptr;
-				if (ret) {
+				if (ret)
+				{
 					fail = 1;
 				}
 				break;
 			}
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 
 	if (fail)
@@ -251,14 +258,15 @@ static ssize_t allocate_variable_buffer_on_node(void *data_interface_, uint32_t
 
 	/* update the data properly in consequence */
 	variable_interface->ptr = addr;
-	
+
 	return allocated_memory;
 }
 
 static void free_variable_buffer_on_node(void *data_interface, uint32_t node)
 {
-	starpu_node_kind kind = _starpu_get_node_kind(node);
-	switch(kind) {
+	enum starpu_node_kind kind = starpu_node_get_kind(node);
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
 			free((void*)STARPU_VARIABLE_GET_PTR(data_interface));
 			break;
@@ -273,7 +281,7 @@ static void free_variable_buffer_on_node(void *data_interface, uint32_t node)
                         break;
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 }
 
@@ -281,8 +289,8 @@ static void free_variable_buffer_on_node(void *data_interface, uint32_t node)
 static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
 				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
 {
-	starpu_variable_interface_t *src_variable = src_interface;
-	starpu_variable_interface_t *dst_variable = dst_interface;
+	struct starpu_variable_interface *src_variable = src_interface;
+	struct starpu_variable_interface *dst_variable = dst_interface;
 
 	cudaError_t cures;
 	cures = cudaMemcpy((char *)dst_variable->ptr, (char *)src_variable->ptr, src_variable->elemsize, kind);
@@ -290,7 +298,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
 
 	return 0;
 }
@@ -315,20 +323,21 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRI
 	{
 		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
 	}
-	else {
+	else
+	{
 #ifdef HAVE_CUDA_MEMCPY_PEER
-		int src_dev = starpu_memory_node_to_devid(src_node);
-		int dst_dev = starpu_memory_node_to_devid(dst_node);
+		int src_dev = _starpu_memory_node_to_devid(src_node);
+		int dst_dev = _starpu_memory_node_to_devid(dst_node);
 
-		starpu_variable_interface_t *src_variable = src_interface;
-		starpu_variable_interface_t *dst_variable = dst_interface;
+		struct starpu_variable_interface *src_variable = src_interface;
+		struct starpu_variable_interface *dst_variable = dst_interface;
 
 		cudaError_t cures;
 		cures = cudaMemcpyPeer((char *)dst_variable->ptr, dst_dev, (char *)src_variable->ptr, src_dev, src_variable->elemsize);
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
 
-		STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
+		_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
 
 #else
 		/* This is illegal without support for cudaMemcpyPeer */
@@ -342,11 +351,13 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 					void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED,
 					cudaStream_t stream, enum cudaMemcpyKind kind)
 {
-	starpu_variable_interface_t *src_variable = src_interface;
-	starpu_variable_interface_t *dst_variable = dst_interface;
+	struct starpu_variable_interface *src_variable = src_interface;
+	struct starpu_variable_interface *dst_variable = dst_interface;
 
 	cudaError_t cures;
+	_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 	cures = cudaMemcpyAsync((char *)dst_variable->ptr, (char *)src_variable->ptr, src_variable->elemsize, kind, stream);
+	_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 	if (cures)
 	{
 		/* do it in a synchronous fashion */
@@ -358,7 +369,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 		return 0;
 	}
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
 
 	return -EAGAIN;
 }
@@ -382,18 +393,21 @@ static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,					v
 	{
 		return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToDevice);
 	}
-	else {
+	else
+	{
 #ifdef HAVE_CUDA_MEMCPY_PEER
-		int src_dev = starpu_memory_node_to_devid(src_node);
-		int dst_dev = starpu_memory_node_to_devid(dst_node);
+		int src_dev = _starpu_memory_node_to_devid(src_node);
+		int dst_dev = _starpu_memory_node_to_devid(dst_node);
 
-		starpu_variable_interface_t *src_variable = src_interface;
-		starpu_variable_interface_t *dst_variable = dst_interface;
+		struct starpu_variable_interface *src_variable = src_interface;
+		struct starpu_variable_interface *dst_variable = dst_interface;
 
 		size_t length = src_variable->elemsize;
 
 		cudaError_t cures;
+		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpyPeerAsync((char *)dst_variable->ptr, dst_dev, (char *)src_variable->ptr, src_dev, length, stream);
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		if (cures)
 		{
 			/* sychronous fallback */
@@ -404,7 +418,7 @@ static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,					v
 			return 0;
 		}
 
-		STARPU_TRACE_DATA_COPY(src_node, dst_node, length);
+		_STARPU_TRACE_DATA_COPY(src_node, dst_node, length);
 
 		return -EAGAIN;
 #else
@@ -422,33 +436,33 @@ static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,					v
 static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface,
                                     unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
 {
-	starpu_variable_interface_t *src_variable = src_interface;
-	starpu_variable_interface_t *dst_variable = dst_interface;
+	struct starpu_variable_interface *src_variable = src_interface;
+	struct starpu_variable_interface *dst_variable = dst_interface;
         int err,ret;
 
-        err = _starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_variable->ptr, (cl_mem)dst_variable->ptr, src_variable->elemsize,
+        err = starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_variable->ptr, src_node, (cl_mem)dst_variable->ptr, dst_node, src_variable->elemsize,
                                                            0, (cl_event*)_event, &ret);
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
 
 	return ret;
 }
 
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
 {
-	starpu_variable_interface_t *src_variable = src_interface;
-	starpu_variable_interface_t *dst_variable = dst_interface;
+	struct starpu_variable_interface *src_variable = src_interface;
+	struct starpu_variable_interface *dst_variable = dst_interface;
         int err, ret;
 
-	err = _starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_variable->ptr, (void*)dst_variable->ptr, src_variable->elemsize,
+	err = starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_variable->ptr, src_node, (void*)dst_variable->ptr, dst_node, src_variable->elemsize,
                                                            0, (cl_event*)_event, &ret);
 
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
 
 	return ret;
 }
@@ -467,21 +481,30 @@ static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_A
 {
 	cl_int err;
 
-	starpu_variable_interface_t *src_variable = src_interface;
-	starpu_variable_interface_t *dst_variable = dst_interface;
+	struct starpu_variable_interface *src_variable = src_interface;
+	struct starpu_variable_interface *dst_variable = dst_interface;
 
 	cl_mem src_ptr = (cl_mem)src_variable->ptr;
 	cl_mem dst_ptr = (cl_mem)dst_variable->ptr;
 
 	cl_command_queue cq;
 	starpu_opencl_get_current_queue(&cq);
+	cl_event event;
 
 	STARPU_ASSERT(src_variable->elemsize == dst_variable->elemsize);
-	err= clEnqueueCopyBuffer(cq, src_ptr, dst_ptr, 0, 0, src_variable->elemsize, 0, NULL, NULL);
+	err= clEnqueueCopyBuffer(cq, src_ptr, dst_ptr, 0, 0, src_variable->elemsize, 0, NULL, &event);
+	if (STARPU_UNLIKELY(err))
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = clWaitForEvents(1, &event);
+	if (STARPU_UNLIKELY(err))
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = clReleaseEvent(event);
 	if (STARPU_UNLIKELY(err))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
 
 	return 0;
 }
@@ -490,8 +513,8 @@ static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_A
 
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
-	starpu_variable_interface_t *src_variable = (starpu_variable_interface_t *) src_interface;
-	starpu_variable_interface_t *dst_variable = (starpu_variable_interface_t *) dst_interface;
+	struct starpu_variable_interface *src_variable = (struct starpu_variable_interface *) src_interface;
+	struct starpu_variable_interface *dst_variable = (struct starpu_variable_interface *) dst_interface;
 
 	size_t elemsize = dst_variable->elemsize;
 
@@ -500,7 +523,7 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBU
 
 	memcpy((void *)ptr_dst, (void *)ptr_src, elemsize);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, elemsize);
 
 	return 0;
 }
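
The copy_opencl_to_opencl() change above now waits on the event returned by clEnqueueCopyBuffer() and releases it, instead of leaving the copy in flight. A minimal standalone sketch of that enqueue/wait/release pattern outside StarPU (the helper name and error handling are illustrative, not part of the patch):

    #include <CL/cl.h>

    /* Enqueue a device-to-device copy, wait for its completion, then release
     * the event so it does not leak.  Returns the first OpenCL error met. */
    static cl_int blocking_buffer_copy(cl_command_queue cq, cl_mem src, cl_mem dst, size_t size)
    {
        cl_event event;
        cl_int err;

        err = clEnqueueCopyBuffer(cq, src, dst, 0, 0, size, 0, NULL, &event);
        if (err != CL_SUCCESS)
            return err;

        err = clWaitForEvents(1, &event);
        if (err != CL_SUCCESS)
        {
            clReleaseEvent(event);
            return err;
        }

        return clReleaseEvent(event);
    }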

+ 53 - 45
src/datawizard/interfaces/vector_filters.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
@@ -22,9 +22,9 @@
 
 void starpu_block_filter_func_vector(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, unsigned nchunks)
 {
-        starpu_vector_interface_t *vector_father = (starpu_vector_interface_t *) father_interface;
-        starpu_vector_interface_t *vector_child = (starpu_vector_interface_t *) child_interface;
-	
+        struct starpu_vector_interface *vector_father = (struct starpu_vector_interface *) father_interface;
+        struct starpu_vector_interface *vector_child = (struct starpu_vector_interface *) child_interface;
+
 	uint32_t nx = vector_father->nx;
 	size_t elemsize = vector_father->elemsize;
 
@@ -33,16 +33,18 @@ void starpu_block_filter_func_vector(void *father_interface, void *child_interfa
 	uint32_t chunk_size = (nx + nchunks - 1)/nchunks;
 	size_t offset = id*chunk_size*elemsize;
 
-	uint32_t child_nx = 
+	uint32_t child_nx =
 	  STARPU_MIN(chunk_size, nx - id*chunk_size);
 
 	vector_child->nx = child_nx;
 	vector_child->elemsize = elemsize;
 
-	if (vector_father->ptr) {
-	  vector_child->ptr = vector_father->ptr + offset;
-	  vector_child->dev_handle = vector_father->dev_handle;
-	  vector_child->offset = vector_father->offset + offset;
+	if (vector_father->dev_handle)
+	{
+		if (vector_father->ptr)
+			vector_child->ptr = vector_father->ptr + offset;
+		vector_child->dev_handle = vector_father->dev_handle;
+		vector_child->offset = vector_father->offset + offset;
 	}
 }
 
@@ -51,9 +53,9 @@ void starpu_vector_divide_in_2_filter_func(void *father_interface, void *child_i
 {
         /* there cannot be more than 2 chunks */
         STARPU_ASSERT(id < 2);
-	
-	starpu_vector_interface_t *vector_father = (starpu_vector_interface_t *) father_interface;
-	starpu_vector_interface_t *vector_child = (starpu_vector_interface_t *) child_interface;
+
+	struct starpu_vector_interface *vector_father = (struct starpu_vector_interface *) father_interface;
+	struct starpu_vector_interface *vector_child = (struct starpu_vector_interface *) child_interface;
 
 	uint32_t length_first = f->filter_arg;
 
@@ -61,37 +63,41 @@ void starpu_vector_divide_in_2_filter_func(void *father_interface, void *child_i
 	size_t elemsize = vector_father->elemsize;
 
 	STARPU_ASSERT(length_first < nx);
-	
+
 	/* this is the first child */
-	if (id == 0) {
-	  vector_child->nx = length_first;
-	  vector_child->elemsize = elemsize;
-
-	  if (vector_father->ptr) {
-	    vector_child->ptr = vector_father->ptr;
-	    vector_child->offset = vector_father->offset;
-	    vector_child->dev_handle = vector_father->dev_handle;
-	  }
+	if (id == 0)
+	{
+		vector_child->nx = length_first;
+		vector_child->elemsize = elemsize;
+
+		if (vector_father->dev_handle)
+		{
+			if (vector_father->ptr)
+				vector_child->ptr = vector_father->ptr;
+			vector_child->offset = vector_father->offset;
+			vector_child->dev_handle = vector_father->dev_handle;
+		}
 	}
-
-	/* the second child */
-	else {
-	  vector_child->nx = nx - length_first;
-	  vector_child->elemsize = elemsize;
-
-	  if (vector_father->ptr) {
-	    vector_child->ptr = vector_father->ptr + length_first*elemsize;
-	    vector_child->offset = vector_father->offset + length_first*elemsize;
-	    vector_child->dev_handle = vector_father->dev_handle;
-	  }
+	else /* the second child */
+	{
+		vector_child->nx = nx - length_first;
+		vector_child->elemsize = elemsize;
+
+		if (vector_father->dev_handle)
+		{
+			if (vector_father->ptr)
+				vector_child->ptr = vector_father->ptr + length_first*elemsize;
+			vector_child->offset = vector_father->offset + length_first*elemsize;
+			vector_child->dev_handle = vector_father->dev_handle;
+		}
 	}
 }
 
 
 void starpu_vector_list_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, STARPU_ATTRIBUTE_UNUSED unsigned nchunks)
 {
-        starpu_vector_interface_t *vector_father = (starpu_vector_interface_t *) father_interface;
-        starpu_vector_interface_t *vector_child = (starpu_vector_interface_t *) child_interface;
+        struct starpu_vector_interface *vector_father = (struct starpu_vector_interface *) father_interface;
+        struct starpu_vector_interface *vector_child = (struct starpu_vector_interface *) child_interface;
 
         uint32_t *length_tab = (uint32_t *) f->filter_arg_ptr;
 
@@ -103,15 +109,17 @@ void starpu_vector_list_filter_func(void *father_interface, void *child_interfac
 
 	vector_child->nx = chunk_size;
 	vector_child->elemsize = elemsize;
-	
-	if (vector_father->ptr) {
-	  /* compute the current position */
-	  unsigned i;
-	  for (i = 0; i < id; i++) 
-	    current_pos += length_tab[i];
-	  
-	  vector_child->ptr = vector_father->ptr + current_pos*elemsize;
-	  vector_child->offset = vector_father->offset + current_pos*elemsize;
-	  vector_child->dev_handle = vector_father->dev_handle;
+
+	if (vector_father->dev_handle)
+	{
+		/* compute the current position */
+		unsigned i;
+		for (i = 0; i < id; i++)
+			current_pos += length_tab[i];
+
+		if (vector_father->ptr)
+			vector_child->ptr = vector_father->ptr + current_pos*elemsize;
+		vector_child->offset = vector_father->offset + current_pos*elemsize;
+		vector_child->dev_handle = vector_father->dev_handle;
 	}
 }
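
For context, a hedged sketch of how these vector filters are typically driven through the public partitioning API of this StarPU generation; the array, handle, and function names are illustrative. The dev_handle guard added above is what lets the filter run even when the father replicate has no CPU-side ptr yet:

    #include <starpu.h>

    #define NX 1024
    static float vec[NX];

    void partition_in_four(void)
    {
        starpu_data_handle_t handle;
        starpu_vector_data_register(&handle, 0, (uintptr_t)vec, NX, sizeof(vec[0]));

        struct starpu_data_filter f =
        {
            .filter_func = starpu_block_filter_func_vector,
            .nchildren = 4,
        };

        /* Creates 4 sub-vectors; each child gets nx and elemsize, and (only
         * when the father is allocated) ptr/dev_handle/offset. */
        starpu_data_partition(handle, &f);

        /* ... submit tasks on starpu_data_get_sub_data(handle, 1, i) ... */

        starpu_data_unpartition(handle, 0);
        starpu_data_unregister(handle);
    }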

+ 111 - 87
src/datawizard/interfaces/vector_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,7 +20,7 @@
 #include <datawizard/coherency.h>
 #include <datawizard/copy_driver.h>
 #include <datawizard/filters.h>
-#include <common/hash.h>
+#include <starpu_hash.h>
 #include <starpu_cuda.h>
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
@@ -42,7 +42,8 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, void *_event);
 #endif
 
-static const struct starpu_data_copy_methods vector_copy_data_methods_s = {
+static const struct starpu_data_copy_methods vector_copy_data_methods_s =
+{
 	.ram_to_ram = copy_ram_to_ram,
 	.ram_to_spu = NULL,
 #ifdef STARPU_USE_CUDA
@@ -66,19 +67,20 @@ static const struct starpu_data_copy_methods vector_copy_data_methods_s = {
 	.spu_to_spu = NULL
 };
 
-static void register_vector_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
+static void register_vector_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface);
 static ssize_t allocate_vector_buffer_on_node(void *data_interface_, uint32_t dst_node);
-static void *vector_handle_to_pointer(starpu_data_handle data_handle, uint32_t node);
+static void *vector_handle_to_pointer(starpu_data_handle_t data_handle, uint32_t node);
 static void free_vector_buffer_on_node(void *data_interface, uint32_t node);
-static size_t vector_interface_get_size(starpu_data_handle handle);
-static uint32_t footprint_vector_interface_crc32(starpu_data_handle handle);
+static size_t vector_interface_get_size(starpu_data_handle_t handle);
+static uint32_t footprint_vector_interface_crc32(starpu_data_handle_t handle);
 static int vector_compare(void *data_interface_a, void *data_interface_b);
-static void display_vector_interface(starpu_data_handle handle, FILE *f);
+static void display_vector_interface(starpu_data_handle_t handle, FILE *f);
 #ifdef STARPU_USE_GORDON
-static int convert_vector_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss); 
+static int convert_vector_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss);
 #endif
 
-static struct starpu_data_interface_ops_t interface_vector_ops = {
+static struct starpu_data_interface_ops interface_vector_ops =
+{
 	.register_data_handle = register_vector_handle,
 	.allocate_data_on_node = allocate_vector_buffer_on_node,
 	.handle_to_pointer = vector_handle_to_pointer,
@@ -91,36 +93,38 @@ static struct starpu_data_interface_ops_t interface_vector_ops = {
 	.convert_to_gordon = convert_vector_to_gordon,
 #endif
 	.interfaceid = STARPU_VECTOR_INTERFACE_ID,
-	.interface_size = sizeof(starpu_vector_interface_t), 
-	.display = display_vector_interface
+	.interface_size = sizeof(struct starpu_vector_interface),
+	.display = display_vector_interface,
 };
 
-static void *vector_handle_to_pointer(starpu_data_handle handle, uint32_t node)
+static void *vector_handle_to_pointer(starpu_data_handle_t handle, uint32_t node)
 {
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_vector_interface_t *vector_interface = (starpu_vector_interface_t *)
+	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
 		starpu_data_get_interface_on_node(handle, node);
 
 	return (void*) vector_interface->ptr;
 }
 
-static void register_vector_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface)
+static void register_vector_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface)
 {
-	starpu_vector_interface_t *vector_interface = (starpu_vector_interface_t *) data_interface;
+	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *) data_interface;
 
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		starpu_vector_interface_t *local_interface = (starpu_vector_interface_t *)
+		struct starpu_vector_interface *local_interface = (struct starpu_vector_interface *)
 			starpu_data_get_interface_on_node(handle, node);
 
-		if (node == home_node) {
+		if (node == home_node)
+		{
 			local_interface->ptr = vector_interface->ptr;
                         local_interface->dev_handle = vector_interface->dev_handle;
                         local_interface->offset = vector_interface->offset;
 		}
-		else {
+		else
+		{
 			local_interface->ptr = 0;
                         local_interface->dev_handle = 0;
                         local_interface->offset = 0;
@@ -132,10 +136,10 @@ static void register_vector_handle(starpu_data_handle handle, uint32_t home_node
 }
 
 #ifdef STARPU_USE_GORDON
-int convert_vector_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss) 
+int convert_vector_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss)
 {
-	starpu_vector_interface_t *vector_interface = interface;
-	
+	struct starpu_vector_interface *vector_interface = interface;
+
 	*ptr = vector_interface->ptr;
 	(*ss).size = vector_interface->nx * vector_interface->elemsize;
 
@@ -144,48 +148,49 @@ int convert_vector_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideS
 #endif
 
 /* declare a new data with the vector interface */
-void starpu_vector_data_register(starpu_data_handle *handleptr, uint32_t home_node,
+void starpu_vector_data_register(starpu_data_handle_t *handleptr, uint32_t home_node,
                         uintptr_t ptr, uint32_t nx, size_t elemsize)
 {
-	starpu_vector_interface_t vector = {
+	struct starpu_vector_interface vector =
+	{
 		.ptr = ptr,
 		.nx = nx,
 		.elemsize = elemsize,
                 .dev_handle = ptr,
                 .offset = 0
-	};	
+	};
 
-	starpu_data_register(handleptr, home_node, &vector, &interface_vector_ops); 
+	starpu_data_register(handleptr, home_node, &vector, &interface_vector_ops);
 }
 
 
-static uint32_t footprint_vector_interface_crc32(starpu_data_handle handle)
+static uint32_t footprint_vector_interface_crc32(starpu_data_handle_t handle)
 {
-	return _starpu_crc32_be(starpu_vector_get_nx(handle), 0);
+	return starpu_crc32_be(starpu_vector_get_nx(handle), 0);
 }
 
 static int vector_compare(void *data_interface_a, void *data_interface_b)
 {
-	starpu_vector_interface_t *vector_a = (starpu_vector_interface_t *) data_interface_a;
-	starpu_vector_interface_t *vector_b = (starpu_vector_interface_t *) data_interface_b;
+	struct starpu_vector_interface *vector_a = (struct starpu_vector_interface *) data_interface_a;
+	struct starpu_vector_interface *vector_b = (struct starpu_vector_interface *) data_interface_b;
 
 	/* Two vectors are considered compatible if they have the same size */
 	return ((vector_a->nx == vector_b->nx)
 			&& (vector_a->elemsize == vector_b->elemsize));
 }
 
-static void display_vector_interface(starpu_data_handle handle, FILE *f)
+static void display_vector_interface(starpu_data_handle_t handle, FILE *f)
 {
-	starpu_vector_interface_t *vector_interface = (starpu_vector_interface_t *)
+	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	fprintf(f, "%u\t", vector_interface->nx);
 }
 
-static size_t vector_interface_get_size(starpu_data_handle handle)
+static size_t vector_interface_get_size(starpu_data_handle_t handle)
 {
 	size_t size;
-	starpu_vector_interface_t *vector_interface = (starpu_vector_interface_t *)
+	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	size = vector_interface->nx*vector_interface->elemsize;
@@ -194,30 +199,30 @@ static size_t vector_interface_get_size(starpu_data_handle handle)
 }
 
 /* offer an access to the data parameters */
-uint32_t starpu_vector_get_nx(starpu_data_handle handle)
+uint32_t starpu_vector_get_nx(starpu_data_handle_t handle)
 {
-	starpu_vector_interface_t *vector_interface = (starpu_vector_interface_t *)
+	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return vector_interface->nx;
 }
 
-uintptr_t starpu_vector_get_local_ptr(starpu_data_handle handle)
+uintptr_t starpu_vector_get_local_ptr(starpu_data_handle_t handle)
 {
 	unsigned node;
 	node = _starpu_get_local_memory_node();
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_vector_interface_t *vector_interface = (starpu_vector_interface_t *)
+	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
 		starpu_data_get_interface_on_node(handle, node);
 
 	return vector_interface->ptr;
 }
 
-size_t starpu_vector_get_elemsize(starpu_data_handle handle)
+size_t starpu_vector_get_elemsize(starpu_data_handle_t handle)
 {
-	starpu_vector_interface_t *vector_interface = (starpu_vector_interface_t *)
+	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return vector_interface->elemsize;
@@ -228,24 +233,25 @@ size_t starpu_vector_get_elemsize(starpu_data_handle handle)
 /* returns the size of the allocated area */
 static ssize_t allocate_vector_buffer_on_node(void *data_interface_, uint32_t dst_node)
 {
-	starpu_vector_interface_t *vector_interface = (starpu_vector_interface_t *) data_interface_;
+	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *) data_interface_;
 
 	unsigned fail = 0;
-	uintptr_t addr = 0;
+	uintptr_t addr = 0, handle = 0;
 	ssize_t allocated_memory;
 
 	uint32_t nx = vector_interface->nx;
 	size_t elemsize = vector_interface->elemsize;
 
-	starpu_node_kind kind = _starpu_get_node_kind(dst_node);
+	enum starpu_node_kind kind = starpu_node_get_kind(dst_node);
 
 #ifdef STARPU_USE_CUDA
 	cudaError_t status;
 #endif
 
-	switch(kind) {
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
-			addr = (uintptr_t)malloc(nx*elemsize);
+			addr = handle = (uintptr_t)malloc(nx*elemsize);
 			if (!addr)
 				fail = 1;
 			break;
@@ -259,23 +265,25 @@ static ssize_t allocate_vector_buffer_on_node(void *data_interface_, uint32_t ds
 
 				fail = 1;
 			}
+			handle = addr;
 			break;
 #endif
 #ifdef STARPU_USE_OPENCL
 	        case STARPU_OPENCL_RAM:
 			{
                                 int ret;
-                                void *ptr;
-                                ret = _starpu_opencl_allocate_memory(&ptr, nx*elemsize, CL_MEM_READ_WRITE);
-                                addr = (uintptr_t)ptr;
-				if (ret) {
+				cl_mem mem;
+                                ret = starpu_opencl_allocate_memory(&mem, nx*elemsize, CL_MEM_READ_WRITE);
+				handle = (uintptr_t)mem;
+				if (ret)
+				{
 					fail = 1;
 				}
 				break;
 			}
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 
 	if (fail)
@@ -286,22 +294,23 @@ static ssize_t allocate_vector_buffer_on_node(void *data_interface_, uint32_t ds
 
 	/* update the data properly in consequence */
 	vector_interface->ptr = addr;
-        vector_interface->dev_handle = addr;
+	vector_interface->dev_handle = handle;
         vector_interface->offset = 0;
-	
+
 	return allocated_memory;
 }
 
 static void free_vector_buffer_on_node(void *data_interface, uint32_t node)
 {
-	starpu_vector_interface_t *vector_interface = (starpu_vector_interface_t *) data_interface;
+	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *) data_interface;
 
 #ifdef STARPU_USE_CUDA
 	cudaError_t cures;
 #endif
 
-	starpu_node_kind kind = _starpu_get_node_kind(node);
-	switch(kind) {
+	enum starpu_node_kind kind = starpu_node_get_kind(node);
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
 			free((void*)vector_interface->ptr);
 			break;
@@ -313,11 +322,11 @@ static void free_vector_buffer_on_node(void *data_interface, uint32_t node)
 #endif
 #ifdef STARPU_USE_OPENCL
                 case STARPU_OPENCL_RAM:
-                        clReleaseMemObject((void *)vector_interface->ptr);
+			clReleaseMemObject((cl_mem)vector_interface->dev_handle);
                         break;
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 }
 
@@ -325,8 +334,8 @@ static void free_vector_buffer_on_node(void *data_interface, uint32_t node)
 static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
 				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
 {
-	starpu_vector_interface_t *src_vector = src_interface;
-	starpu_vector_interface_t *dst_vector = dst_interface;
+	struct starpu_vector_interface *src_vector = src_interface;
+	struct starpu_vector_interface *dst_vector = dst_interface;
 
 	cudaError_t cures;
 
@@ -334,7 +343,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
 
 	return 0;
 }
@@ -346,19 +355,21 @@ static int copy_cuda_peer_common(void *src_interface, unsigned src_node,
 {
 	cudaError_t cures;
 
-	starpu_vector_interface_t *src_vector = src_interface;
-	starpu_vector_interface_t *dst_vector = dst_interface;
+	struct starpu_vector_interface *src_vector = src_interface;
+	struct starpu_vector_interface *dst_vector = dst_interface;
 
 	size_t length = src_vector->nx*src_vector->elemsize;
 
-	int src_dev = starpu_memory_node_to_devid(src_node);
-	int dst_dev = starpu_memory_node_to_devid(dst_node);
+	int src_dev = _starpu_memory_node_to_devid(src_node);
+	int dst_dev = _starpu_memory_node_to_devid(dst_node);
 
 	if (is_async)
 	{
+		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpyPeerAsync((char *)dst_vector->ptr, dst_dev,
 						(char *)src_vector->ptr, src_dev,
 						length, stream);
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		if (!cures)
 			return -EAGAIN;
 	}
@@ -368,7 +379,7 @@ static int copy_cuda_peer_common(void *src_interface, unsigned src_node,
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, length);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, length);
 
 	return 0;
 }
@@ -393,7 +404,8 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRI
 	{
 		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
 	}
-	else {
+	else
+	{
 #ifdef HAVE_CUDA_MEMCPY_PEER
 		return copy_cuda_peer_common(src_interface, src_node, dst_interface, dst_node, 0, 0);
 #else
@@ -408,12 +420,14 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 					void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED,
 					cudaStream_t stream, enum cudaMemcpyKind kind)
 {
-	starpu_vector_interface_t *src_vector = src_interface;
-	starpu_vector_interface_t *dst_vector = dst_interface;
+	struct starpu_vector_interface *src_vector = src_interface;
+	struct starpu_vector_interface *dst_vector = dst_interface;
 
 	cudaError_t cures;
 
+	_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 	cures = cudaMemcpyAsync((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, kind, stream);
+	_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 	if (cures)
 	{
 		/* do it in a synchronous fashion */
@@ -424,18 +438,19 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 		return 0;
 	}
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
 
 	return -EAGAIN;
 }
 
-static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,					void *dst_interface, unsigned dst_node, cudaStream_t stream)
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
 {
 	if (src_node == dst_node)
 	{
 		return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToDevice);
 	}
-	else {
+	else
+	{
 #ifdef HAVE_CUDA_MEMCPY_PEER
 		return copy_cuda_peer_common(src_interface, src_node, dst_interface, dst_node, 1, stream);
 #else
@@ -464,17 +479,17 @@ static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_
 static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
                                     void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
 {
-	starpu_vector_interface_t *src_vector = src_interface;
-	starpu_vector_interface_t *dst_vector = dst_interface;
+	struct starpu_vector_interface *src_vector = src_interface;
+	struct starpu_vector_interface *dst_vector = dst_interface;
         int err, ret;
 
-	err = _starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_vector->ptr, (cl_mem)dst_vector->dev_handle,
+	err = starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_vector->ptr, src_node, (cl_mem)dst_vector->dev_handle, dst_node,
                                                            src_vector->nx*src_vector->elemsize,
                                                            dst_vector->offset, (cl_event*)_event, &ret);
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
 
 	return ret;
 }
@@ -482,16 +497,16 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
                                     void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
 {
-	starpu_vector_interface_t *src_vector = src_interface;
-	starpu_vector_interface_t *dst_vector = dst_interface;
+	struct starpu_vector_interface *src_vector = src_interface;
+	struct starpu_vector_interface *dst_vector = dst_interface;
         int err, ret;
 
-	err = _starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_vector->dev_handle, (void*)dst_vector->ptr, src_vector->nx*src_vector->elemsize,
+	err = starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_vector->dev_handle, src_node, (void*)dst_vector->ptr, dst_node, src_vector->nx*src_vector->elemsize,
                                                            src_vector->offset, (cl_event *)_event, &ret);
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
 
 	return ret;
 }
@@ -513,19 +528,28 @@ static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_A
 {
         int err;
 
-	starpu_vector_interface_t *src_vector = src_interface;
-	starpu_vector_interface_t *dst_vector = dst_interface;
+	struct starpu_vector_interface *src_vector = src_interface;
+	struct starpu_vector_interface *dst_vector = dst_interface;
 
 	cl_command_queue cq;
 	starpu_opencl_get_current_queue(&cq);
 
 	size_t size = src_vector->nx*src_vector->elemsize;
+	cl_event event;
+
+	err = clEnqueueCopyBuffer(cq, (cl_mem)src_vector->dev_handle, (cl_mem)dst_vector->dev_handle, src_vector->offset, dst_vector->offset, size, 0, NULL, &event);
+        if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = clWaitForEvents(1, &event);
+        if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = clEnqueueCopyBuffer(cq, (cl_mem)src_vector->dev_handle, (cl_mem)dst_vector->dev_handle, src_vector->offset, dst_vector->offset, size, 0, NULL, NULL); 
+	err = clReleaseEvent(event);
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
 
 	return 0;
 }
@@ -536,8 +560,8 @@ static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_A
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
 					void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
-	starpu_vector_interface_t *src_vector = (starpu_vector_interface_t *) src_interface;
-	starpu_vector_interface_t *dst_vector = (starpu_vector_interface_t *) dst_interface;
+	struct starpu_vector_interface *src_vector = (struct starpu_vector_interface *) src_interface;
+	struct starpu_vector_interface *dst_vector = (struct starpu_vector_interface *) dst_interface;
 
 	uint32_t nx = dst_vector->nx;
 	size_t elemsize = dst_vector->elemsize;
@@ -547,7 +571,7 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBU
 
 	memcpy((void *)ptr_dst, (void *)ptr_src, nx*elemsize);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, nx*elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nx*elemsize);
 
 	return 0;
 }
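
A hedged sketch of the consumer side: once the allocation and copy methods above have produced a local replicate, a CPU kernel reads it through the public vector accessors. The codelet function itself is illustrative:

    #include <starpu.h>

    /* Scale a vector in place; buffers[0] is the per-node
     * struct starpu_vector_interface filled in by the methods above. */
    void scal_cpu_func(void *buffers[], void *cl_arg)
    {
        float factor = *(float *) cl_arg;
        struct starpu_vector_interface *v = (struct starpu_vector_interface *) buffers[0];

        float *ptr = (float *) STARPU_VECTOR_GET_PTR(v);
        uint32_t nx = STARPU_VECTOR_GET_NX(v);

        uint32_t i;
        for (i = 0; i < nx; i++)
            ptr[i] *= factor;
    }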

+ 16 - 14
src/datawizard/interfaces/void_interface.c

@@ -20,7 +20,7 @@
 #include <datawizard/coherency.h>
 #include <datawizard/copy_driver.h>
 #include <datawizard/filters.h>
-#include <common/hash.h>
+#include <starpu_hash.h>
 #include <starpu_cuda.h>
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
@@ -33,7 +33,8 @@ static int dummy_cuda_copy_async(void *src_interface, unsigned src_node, void *d
 static int dummy_opencl_copy_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *_event);
 #endif
 
-static const struct starpu_data_copy_methods void_copy_data_methods_s = {
+static const struct starpu_data_copy_methods void_copy_data_methods_s =
+{
 	.ram_to_ram = dummy_copy,
 	.ram_to_spu = dummy_copy,
 #ifdef STARPU_USE_CUDA
@@ -56,15 +57,16 @@ static const struct starpu_data_copy_methods void_copy_data_methods_s = {
 	.spu_to_spu = dummy_copy
 };
 
-static void register_void_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
+static void register_void_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface);
 static ssize_t allocate_void_buffer_on_node(void *data_interface_, uint32_t dst_node);
 static void free_void_buffer_on_node(void *data_interface, uint32_t node);
-static size_t void_interface_get_size(starpu_data_handle handle);
-static uint32_t footprint_void_interface_crc32(starpu_data_handle handle);
+static size_t void_interface_get_size(starpu_data_handle_t handle);
+static uint32_t footprint_void_interface_crc32(starpu_data_handle_t handle);
 static int void_compare(void *data_interface_a, void *data_interface_b);
-static void display_void_interface(starpu_data_handle handle, FILE *f);
+static void display_void_interface(starpu_data_handle_t handle, FILE *f);
 
-static struct starpu_data_interface_ops_t interface_void_ops = {
+static struct starpu_data_interface_ops interface_void_ops =
+{
 	.register_data_handle = register_void_handle,
 	.allocate_data_on_node = allocate_void_buffer_on_node,
 	.free_data_on_node = free_void_buffer_on_node,
@@ -73,11 +75,11 @@ static struct starpu_data_interface_ops_t interface_void_ops = {
 	.footprint = footprint_void_interface_crc32,
 	.compare = void_compare,
 	.interfaceid = STARPU_VOID_INTERFACE_ID,
-	.interface_size = 0, 
+	.interface_size = 0,
 	.display = display_void_interface
 };
 
-static void register_void_handle(starpu_data_handle handle STARPU_ATTRIBUTE_UNUSED,
+static void register_void_handle(starpu_data_handle_t handle STARPU_ATTRIBUTE_UNUSED,
 				uint32_t home_node STARPU_ATTRIBUTE_UNUSED,
 				void *data_interface STARPU_ATTRIBUTE_UNUSED)
 {
@@ -85,13 +87,13 @@ static void register_void_handle(starpu_data_handle handle STARPU_ATTRIBUTE_UNUS
 }
 
 /* declare a new data with the void interface */
-void starpu_void_data_register(starpu_data_handle *handleptr)
+void starpu_void_data_register(starpu_data_handle_t *handleptr)
 {
-	starpu_data_register(handleptr, 0, NULL, &interface_void_ops); 
+	starpu_data_register(handleptr, 0, NULL, &interface_void_ops);
 }
 
 
-static uint32_t footprint_void_interface_crc32(starpu_data_handle handle STARPU_ATTRIBUTE_UNUSED)
+static uint32_t footprint_void_interface_crc32(starpu_data_handle_t handle STARPU_ATTRIBUTE_UNUSED)
 {
 	return 0;
 }
@@ -104,12 +106,12 @@ static int void_compare(void *data_interface_a STARPU_ATTRIBUTE_UNUSED,
 	return 1;
 }
 
-static void display_void_interface(starpu_data_handle handle STARPU_ATTRIBUTE_UNUSED, FILE *f)
+static void display_void_interface(starpu_data_handle_t handle STARPU_ATTRIBUTE_UNUSED, FILE *f)
 {
 	fprintf(f, "void\t");
 }
 
-static size_t void_interface_get_size(starpu_data_handle handle STARPU_ATTRIBUTE_UNUSED)
+static size_t void_interface_get_size(starpu_data_handle_t handle STARPU_ATTRIBUTE_UNUSED)
 {
 	return 0;
 }
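
As a usage sketch (assuming the starpu_insert_task() helper and illustrative codelet pointers), a void handle is registered purely to express task dependencies; the dummy copy methods above mean no bytes are ever moved for it:

    #include <starpu.h>

    /* Serialize two codelets through a payload-free handle. */
    void chain_through_void(struct starpu_codelet *first, struct starpu_codelet *second)
    {
        starpu_data_handle_t sync;
        starpu_void_data_register(&sync);

        /* Writer then reader: the second task waits for the first although
         * no data is ever transferred. */
        starpu_insert_task(first, STARPU_W, sync, 0);
        starpu_insert_task(second, STARPU_R, sync, 0);

        starpu_data_unregister(sync);
    }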

+ 317 - 144
src/datawizard/memalloc.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -17,24 +17,39 @@
 
 #include <datawizard/memalloc.h>
 #include <datawizard/footprint.h>
+#include <starpu_cuda.h>
+#include <starpu_opencl.h>
 
 /* This per-node RW-locks protect mc_list and memchunk_cache entries */
-static pthread_rwlock_t mc_rwlock[STARPU_MAXNODES]; 
+static pthread_rwlock_t mc_rwlock[STARPU_MAXNODES];
+
+/* This per-node RW-lock protects lru_list */
+static pthread_rwlock_t lru_rwlock[STARPU_MAXNODES];
+
+/* Least recently used memory chunks */
+static struct _starpu_mem_chunk_lru_list *starpu_lru_list[STARPU_MAXNODES];
 
 /* Potentially in use memory chunks */
-static starpu_mem_chunk_list_t mc_list[STARPU_MAXNODES];
+static struct _starpu_mem_chunk_list *mc_list[STARPU_MAXNODES];
 
 /* Explicitly caches memory chunks that can be reused */
-static starpu_mem_chunk_list_t memchunk_cache[STARPU_MAXNODES];
+static struct _starpu_mem_chunk_list *memchunk_cache[STARPU_MAXNODES];
+
+/* When reclaiming memory to allocate, we reclaim MAX(what_is_to_reclaim_on_device, data_size_coefficient*data_size) */
+const unsigned starpu_memstrategy_data_size_coefficient=2;
+
+static void starpu_lru(unsigned node);
 
 void _starpu_init_mem_chunk_lists(void)
 {
 	unsigned i;
 	for (i = 0; i < STARPU_MAXNODES; i++)
 	{
-		PTHREAD_RWLOCK_INIT(&mc_rwlock[i], NULL);
-		mc_list[i] = starpu_mem_chunk_list_new();
-		memchunk_cache[i] = starpu_mem_chunk_list_new();
+		_STARPU_PTHREAD_RWLOCK_INIT(&mc_rwlock[i], NULL);
+		_STARPU_PTHREAD_RWLOCK_INIT(&lru_rwlock[i], NULL);
+		mc_list[i] = _starpu_mem_chunk_list_new();
+		starpu_lru_list[i] = _starpu_mem_chunk_lru_list_new();
+		memchunk_cache[i] = _starpu_mem_chunk_list_new();
 	}
 }
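
The comment above starpu_memstrategy_data_size_coefficient states the sizing rule applied when an allocation fails. A small sketch of that rule; the helper name and its node_target parameter are illustrative, not StarPU API:

    /* Reclaim at least coefficient*data_size bytes, or more when the per-node
     * pressure (node_target) asks for it. */
    static size_t compute_reclaim_target(size_t node_target, size_t data_size)
    {
        size_t by_data = starpu_memstrategy_data_size_coefficient * data_size;
        return (node_target > by_data) ? node_target : by_data;
    }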
 
@@ -43,8 +58,9 @@ void _starpu_deinit_mem_chunk_lists(void)
 	unsigned i;
 	for (i = 0; i < STARPU_MAXNODES; i++)
 	{
-		starpu_mem_chunk_list_delete(mc_list[i]);
-		starpu_mem_chunk_list_delete(memchunk_cache[i]);
+		_starpu_mem_chunk_list_delete(mc_list[i]);
+		_starpu_mem_chunk_list_delete(memchunk_cache[i]);
+		_starpu_mem_chunk_lru_list_delete(starpu_lru_list[i]);
 	}
 }
 
@@ -52,7 +68,7 @@ void _starpu_deinit_mem_chunk_lists(void)
  *	Manipulate subtrees
  */
 
-static void lock_all_subtree(starpu_data_handle handle)
+static void lock_all_subtree(starpu_data_handle_t handle)
 {
 	if (handle->nchildren == 0)
 	{
@@ -60,7 +76,8 @@ static void lock_all_subtree(starpu_data_handle handle)
 		while (_starpu_spin_trylock(&handle->header_lock))
 			_starpu_datawizard_progress(_starpu_get_local_memory_node(), 0);
 	}
-	else {
+	else
+	{
 		/* lock all sub-subtrees children */
 		unsigned child;
 		for (child = 0; child < handle->nchildren; child++)
@@ -70,15 +87,16 @@ static void lock_all_subtree(starpu_data_handle handle)
 	}
 }
 
-static void unlock_all_subtree(starpu_data_handle handle)
+static void unlock_all_subtree(starpu_data_handle_t handle)
 {
 	if (handle->nchildren == 0)
 	{
-		/* this is a leaf */	
+		/* this is a leaf */
 		_starpu_spin_unlock(&handle->header_lock);
 	}
-	else {
-		/* lock all sub-subtrees children 
+	else
+	{
+		/* lock all sub-subtrees children
 		 * Note that this is done in the reverse order of the
 		 * lock_all_subtree so that we avoid deadlock */
 		unsigned i;
@@ -90,16 +108,16 @@ static void unlock_all_subtree(starpu_data_handle handle)
 	}
 }
 
-static unsigned may_free_subtree(starpu_data_handle handle, unsigned node)
+static unsigned may_free_subtree(starpu_data_handle_t handle, unsigned node)
 {
 	/* we only free if no one refers to the leaf */
 	uint32_t refcnt = _starpu_get_data_refcnt(handle, node);
 	if (refcnt)
 		return 0;
-	
+
 	if (!handle->nchildren)
 		return 1;
-	
+
 	/* look into all sub-subtrees children */
 	unsigned child;
 	for (child = 0; child < handle->nchildren; child++)
@@ -113,8 +131,8 @@ static unsigned may_free_subtree(starpu_data_handle handle, unsigned node)
 	return 1;
 }
 
-static void transfer_subtree_to_node(starpu_data_handle handle, unsigned src_node, 
-						unsigned dst_node)
+static void transfer_subtree_to_node(starpu_data_handle_t handle, unsigned src_node,
+				     unsigned dst_node)
 {
 	unsigned i;
 	unsigned last = 0;
@@ -123,11 +141,12 @@ static void transfer_subtree_to_node(starpu_data_handle handle, unsigned src_nod
 
 	if (handle->nchildren == 0)
 	{
-		struct starpu_data_replicate_s *src_replicate = &handle->per_node[src_node];
-		struct starpu_data_replicate_s *dst_replicate = &handle->per_node[dst_node];
+		struct _starpu_data_replicate *src_replicate = &handle->per_node[src_node];
+		struct _starpu_data_replicate *dst_replicate = &handle->per_node[dst_node];
 
 		/* this is a leaf */
-		switch(src_replicate->state) {
+		switch(src_replicate->state)
+		{
 		case STARPU_OWNER:
 			/* the local node has the only copy */
 			/* the owner is now the destination_node */
@@ -138,14 +157,19 @@ static void transfer_subtree_to_node(starpu_data_handle handle, unsigned src_nod
 #warning we should use requests during memory reclaim
 #endif
 			/* TODO use request !! */
+			/* Take temporary references on the replicates */
 			src_replicate->refcnt++;
 			dst_replicate->refcnt++;
+			handle->busy_count+=2;
 
 			ret = _starpu_driver_copy_data_1_to_1(handle, src_replicate, dst_replicate, 0, NULL, 1);
 			STARPU_ASSERT(ret == 0);
 
 			src_replicate->refcnt--;
 			dst_replicate->refcnt--;
+			STARPU_ASSERT(handle->busy_count >= 2);
+			handle->busy_count -= 2;
+			_starpu_data_check_not_busy(handle);
 
 			break;
 		case STARPU_SHARED:
@@ -156,11 +180,13 @@ static void transfer_subtree_to_node(starpu_data_handle handle, unsigned src_nod
 			cnt = 0;
 			for (i = 0; i < STARPU_MAXNODES; i++)
 			{
-				if (handle->per_node[i].state == STARPU_SHARED) {
-					cnt++; 
+				if (handle->per_node[i].state == STARPU_SHARED)
+				{
+					cnt++;
 					last = i;
 				}
 			}
+			STARPU_ASSERT(cnt > 0);
 
 			if (cnt == 1)
 				handle->per_node[last].state = STARPU_OWNER;
@@ -174,7 +200,8 @@ static void transfer_subtree_to_node(starpu_data_handle handle, unsigned src_nod
 			break;
 		}
 	}
-	else {
+	else
+	{
 		/* lock all sub-subtrees children */
 		unsigned child;
 		for (child = 0; child < handle->nchildren; child++)
@@ -185,20 +212,20 @@ static void transfer_subtree_to_node(starpu_data_handle handle, unsigned src_nod
 	}
 }
 
-static size_t free_memory_on_node(starpu_mem_chunk_t mc, uint32_t node)
+static size_t free_memory_on_node(struct _starpu_mem_chunk *mc, uint32_t node)
 {
 	size_t freed = 0;
 
 	STARPU_ASSERT(mc->ops);
 	STARPU_ASSERT(mc->ops->free_data_on_node);
 
-	starpu_data_handle handle = mc->data;
+	starpu_data_handle_t handle = mc->data;
 
 	/* Does this memory chunk refers to a handle that does not exist
 	 * anymore ? */
 	unsigned data_was_deleted = mc->data_was_deleted;
 
-	struct starpu_data_replicate_s *replicate = mc->replicate;
+	struct _starpu_data_replicate *replicate = mc->replicate;
 
 //	while (_starpu_spin_trylock(&handle->header_lock))
 //		_starpu_datawizard_progress(_starpu_get_local_memory_node());
@@ -208,20 +235,20 @@ static size_t free_memory_on_node(starpu_mem_chunk_t mc, uint32_t node)
 #endif
 //	_starpu_spin_lock(&handle->header_lock);
 
-	if (mc->automatically_allocated && 
+	if (mc->automatically_allocated &&
 		(!handle || data_was_deleted || replicate->refcnt == 0))
 	{
 		if (handle && !data_was_deleted)
 			STARPU_ASSERT(replicate->allocated);
 
 #if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER)
-		if (_starpu_get_node_kind(node) == STARPU_CUDA_RAM)
+		if (starpu_node_get_kind(node) == STARPU_CUDA_RAM)
 		{
 			/* To facilitate the design of interface, we set the
 			 * proper CUDA device in case it is needed. This avoids
 			 * having to set it again in the free method of each
 			 * interface. */
-			cudaError_t err = cudaSetDevice(starpu_memory_node_to_devid(node));
+			cudaError_t err = cudaSetDevice(_starpu_memory_node_to_devid(node));
 			STARPU_ASSERT(err == cudaSuccess);
 		}
 #endif
@@ -249,35 +276,47 @@ static size_t free_memory_on_node(starpu_mem_chunk_t mc, uint32_t node)
 
 
 
-static size_t do_free_mem_chunk(starpu_mem_chunk_t mc, unsigned node)
+static size_t do_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
 {
 	size_t size;
 
+	mc->replicate->mc=NULL;
+
 	/* free the actual buffer */
 	size = free_memory_on_node(mc, node);
 
 	/* remove the mem_chunk from the list */
-	starpu_mem_chunk_list_erase(mc_list[node], mc);
+	_starpu_mem_chunk_list_erase(mc_list[node], mc);
 
 	free(mc->chunk_interface);
-	starpu_mem_chunk_delete(mc);
+	_starpu_mem_chunk_delete(mc);
 
-	return size; 
+	return size;
 }
 
 /* This function is called for memory chunks that are possibly in used (ie. not
  * in the cache). They should therefore still be associated to a handle. */
-static size_t try_to_free_mem_chunk(starpu_mem_chunk_t mc, unsigned node)
+static size_t try_to_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
 {
 	size_t freed = 0;
 
-	starpu_data_handle handle;
+	starpu_data_handle_t handle;
 	handle = mc->data;
 	STARPU_ASSERT(handle);
 
-	/* Either it's a "relaxed coherency" memchunk, or it's a memchunk that
-	 * could be used with filters. */
-	if (mc->relaxed_coherency)
+	/* This data should be written through to this node, avoid dropping it! */
+	if (handle->wt_mask & (1<<node))
+		return 0;
+
+	/* REDUX memchunk */
+	if (mc->relaxed_coherency == 2)
+	{
+		/* TODO: reduce it back to e.g. main memory */
+	}
+	else
+	/* Either it's a "relaxed coherency" memchunk (SCRATCH), or it's a
+	 * memchunk that could be used with filters. */
+	if (mc->relaxed_coherency == 1)
 	{
 		STARPU_ASSERT(mc->replicate);
 
@@ -295,25 +334,35 @@ static size_t try_to_free_mem_chunk(starpu_mem_chunk_t mc, unsigned node)
 
 		_starpu_spin_unlock(&handle->header_lock);
 	}
-	else {
+	else
+	{
 		/* try to lock all the leafs of the subtree */
 		lock_all_subtree(handle);
-	
+
 		/* check if they are all "free" */
 		if (may_free_subtree(handle, node))
 		{
 			STARPU_ASSERT(handle->per_node[node].refcnt == 0);
-	
-			/* in case there was nobody using that buffer, throw it 
+
+#ifdef STARPU_MEMORY_STATUS
+			if (handle->per_node[node].state == STARPU_OWNER)
+				_starpu_handle_stats_invalidated(handle, node);
+			/* else XXX Considering only owner to invalidate */
+#endif
+
+			/* in case there was nobody using that buffer, throw it
 			 * away after writing it back to main memory */
 			transfer_subtree_to_node(handle, node, 0);
-	
+
+#ifdef STARPU_MEMORY_STATUS
+			_starpu_handle_stats_loaded_owner(handle, 0);
+#endif
 			STARPU_ASSERT(handle->per_node[node].refcnt == 0);
-	
+
 			/* now the actual buffer may be freed */
 			freed = do_free_mem_chunk(mc, node);
 		}
-	
+
 		/* unlock the leafs */
 		unlock_all_subtree(handle);
 	}
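
The new wt_mask test above keeps write-through replicates from being reclaimed. A hedged sketch of the public call that sets this mask (the wrapper function is illustrative):

    #include <starpu.h>

    /* Request write-through to main memory (node 0): modified data is pushed
     * back there, and with the check above that replicate is not evicted. */
    void keep_in_main_ram(starpu_data_handle_t handle)
    {
        starpu_data_set_wt_mask(handle, 1 << 0);
    }
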
@@ -324,21 +373,18 @@ static size_t try_to_free_mem_chunk(starpu_mem_chunk_t mc, unsigned node)
 /* We assume that mc_rwlock[node] is taken. is_already_in_mc_list indicates
  * that the mc is already in the list of buffers that are possibly used, and
  * therefore not in the cache. */
-static void reuse_mem_chunk(unsigned node, struct starpu_data_replicate_s *new_replicate, starpu_mem_chunk_t mc, unsigned is_already_in_mc_list)
+static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_replicate, struct _starpu_mem_chunk *mc, unsigned is_already_in_mc_list)
 {
-	starpu_data_handle old_data;
-	old_data = mc->data;
-
 	/* we found an appropriate mem chunk: so we get it out
 	 * of the "to free" list, and reassign it to the new
 	 * piece of data */
 
 	if (!is_already_in_mc_list)
 	{
-		starpu_mem_chunk_list_erase(memchunk_cache[node], mc);
+		_starpu_mem_chunk_list_erase(memchunk_cache[node], mc);
 	}
 
-	struct starpu_data_replicate_s *old_replicate = mc->replicate;
+	struct _starpu_data_replicate *old_replicate = mc->replicate;
 	old_replicate->allocated = 0;
 	old_replicate->automatically_allocated = 0;
 	old_replicate->initialized = 0;
@@ -347,27 +393,27 @@ static void reuse_mem_chunk(unsigned node, struct starpu_data_replicate_s *new_r
 	new_replicate->automatically_allocated = 1;
 	new_replicate->initialized = 0;
 
-	STARPU_ASSERT(new_replicate->chunk_interface);
+	STARPU_ASSERT(new_replicate->data_interface);
 	STARPU_ASSERT(mc->chunk_interface);
-	memcpy(new_replicate->chunk_interface, mc->chunk_interface, old_replicate->ops->interface_size);
+	memcpy(new_replicate->data_interface, mc->chunk_interface, old_replicate->handle->ops->interface_size);
 
 	mc->data = new_replicate->handle;
 	mc->data_was_deleted = 0;
 	/* mc->ops, mc->size, mc->footprint and mc->interface should be
  	 * unchanged ! */
-	
+
 	/* reinsert the mem chunk in the list of active memory chunks */
 	if (!is_already_in_mc_list)
 	{
-		starpu_mem_chunk_list_push_front(mc_list[node], mc);
+		_starpu_mem_chunk_list_push_front(mc_list[node], mc);
 	}
 }
 
-static unsigned try_to_reuse_mem_chunk(starpu_mem_chunk_t mc, unsigned node, starpu_data_handle new_data, unsigned is_already_in_mc_list)
+static unsigned try_to_reuse_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node, struct _starpu_data_replicate *replicate, unsigned is_already_in_mc_list)
 {
 	unsigned success = 0;
 
-	starpu_data_handle old_data;
+	starpu_data_handle_t old_data;
 
 	old_data = mc->data;
 
@@ -381,12 +427,12 @@ static unsigned try_to_reuse_mem_chunk(starpu_mem_chunk_t mc, unsigned node, sta
 	{
 		success = 1;
 
-		/* in case there was nobody using that buffer, throw it 
+		/* in case there was nobody using that buffer, throw it
 		 * away after writing it back to main memory */
 		transfer_subtree_to_node(old_data, node, 0);
 
 		/* now replace the previous data */
-		reuse_mem_chunk(node, new_data, mc, is_already_in_mc_list);
+		reuse_mem_chunk(node, replicate, mc, is_already_in_mc_list);
 	}
 
 	/* unlock the leafs */
@@ -395,38 +441,38 @@ static unsigned try_to_reuse_mem_chunk(starpu_mem_chunk_t mc, unsigned node, sta
 	return success;
 }
 
-static int _starpu_data_interface_compare(void *data_interface_a, struct starpu_data_interface_ops_t *ops_a,
-                                          void *data_interface_b, struct starpu_data_interface_ops_t *ops_b)
+static int _starpu_data_interface_compare(void *data_interface_a, struct starpu_data_interface_ops *ops_a,
+                                          void *data_interface_b, struct starpu_data_interface_ops *ops_b)
 {
 	if (ops_a->interfaceid != ops_b->interfaceid)
 		return -1;
 
-	int ret = ops_a->compare(interface_a, interface_b);
+	int ret = ops_a->compare(data_interface_a, data_interface_b);
 
 	return ret;
 }
 
 /* This function must be called with mc_rwlock[node] taken in write mode */
-static starpu_mem_chunk_t _starpu_memchunk_cache_lookup_locked(uint32_t node, starpu_data_handle handle)
+static struct _starpu_mem_chunk *_starpu_memchunk_cache_lookup_locked(uint32_t node, starpu_data_handle_t handle)
 {
 	uint32_t footprint = _starpu_compute_data_footprint(handle);
 
 	/* go through all buffers in the cache */
-	starpu_mem_chunk_t mc;
-	for (mc = starpu_mem_chunk_list_begin(memchunk_cache[node]);
-	     mc != starpu_mem_chunk_list_end(memchunk_cache[node]);
-	     mc = starpu_mem_chunk_list_next(mc))
+	struct _starpu_mem_chunk *mc;
+	for (mc = _starpu_mem_chunk_list_begin(memchunk_cache[node]);
+	     mc != _starpu_mem_chunk_list_end(memchunk_cache[node]);
+	     mc = _starpu_mem_chunk_list_next(mc))
 	{
 		if (mc->footprint == footprint)
 		{
 			/* Is that a false hit ? (this is _very_ unlikely) */
-			if (_starpu_data_interface_compare(handle->per_node[node].interface, handle->ops, mc->interface, mc->ops))
+			if (_starpu_data_interface_compare(handle->per_node[node].data_interface, handle->ops, mc->chunk_interface, mc->ops))
 				continue;
 
 			/* Cache hit */
 
 			/* Remove from the cache */
-			starpu_mem_chunk_list_erase(memchunk_cache[node], mc);
+			_starpu_mem_chunk_list_erase(memchunk_cache[node], mc);
 			return mc;
 		}
 	}
@@ -438,33 +484,33 @@ static starpu_mem_chunk_t _starpu_memchunk_cache_lookup_locked(uint32_t node, st
 /* this function looks for a memory chunk that matches a given footprint in the
  * list of mem chunk that need to be freed. This function must be called with
  * mc_rwlock[node] taken in write mode. */
-static unsigned try_to_find_reusable_mem_chunk(unsigned node, starpu_data_handle data, uint32_t footprint)
+static unsigned try_to_find_reusable_mem_chunk(unsigned node, starpu_data_handle_t data, struct _starpu_data_replicate *replicate, uint32_t footprint)
 {
-	starpu_mem_chunk_t mc, next_mc;
+	struct _starpu_mem_chunk *mc, *next_mc;
 
 	/* go through all buffers in the cache */
-	mc = _starpu_memchunk_cache_lookup_locked(node, handle);
+	mc = _starpu_memchunk_cache_lookup_locked(node, data);
 	if (mc)
 	{
 		/* We found an entry in the cache so we can reuse it */
-		reuse_mem_chunk(node, data, mc, 0);
+		reuse_mem_chunk(node, replicate, mc, 0);
 		return 1;
 	}
 
 	/* now look for some non essential data in the active list */
-	for (mc = starpu_mem_chunk_list_begin(mc_list[node]);
-	     mc != starpu_mem_chunk_list_end(mc_list[node]);
+	for (mc = _starpu_mem_chunk_list_begin(mc_list[node]);
+	     mc != _starpu_mem_chunk_list_end(mc_list[node]);
 	     mc = next_mc)
 	{
 		/* there is a risk that the memory chunk is freed before next
 		 * iteration starts: so we compute the next element of the list
 		 * now */
-		next_mc = starpu_mem_chunk_list_next(mc);
+		next_mc = _starpu_mem_chunk_list_next(mc);
 
 		if (mc->data->is_not_important && (mc->footprint == footprint))
 		{
 //			fprintf(stderr, "found a candidate ...\n");
-			if (try_to_reuse_mem_chunk(mc, node, data, 1))
+			if (try_to_reuse_mem_chunk(mc, node, replicate, 1))
 				return 1;
 		}
 	}
@@ -477,24 +523,26 @@ static unsigned try_to_find_reusable_mem_chunk(unsigned node, starpu_data_handle
  * Free the memory chuncks that are explicitely tagged to be freed. The
  * mc_rwlock[node] rw-lock should be taken prior to calling this function.
  */
-static size_t flush_memchunk_cache(uint32_t node)
+static size_t flush_memchunk_cache(uint32_t node, size_t reclaim)
 {
-	starpu_mem_chunk_t mc, next_mc;
-	
+	struct _starpu_mem_chunk *mc, *next_mc;
+
 	size_t freed = 0;
 
-	for (mc = starpu_mem_chunk_list_begin(memchunk_cache[node]);
-	     mc != starpu_mem_chunk_list_end(memchunk_cache[node]);
+	for (mc = _starpu_mem_chunk_list_begin(memchunk_cache[node]);
+	     mc != _starpu_mem_chunk_list_end(memchunk_cache[node]);
 	     mc = next_mc)
 	{
-		next_mc = starpu_mem_chunk_list_next(mc);
+		next_mc = _starpu_mem_chunk_list_next(mc);
 
 		freed += free_memory_on_node(mc, node);
 
-		starpu_mem_chunk_list_erase(memchunk_cache[node], mc);
+		_starpu_mem_chunk_list_erase(memchunk_cache[node], mc);
 
 		free(mc->chunk_interface);
-		starpu_mem_chunk_delete(mc);
+		_starpu_mem_chunk_delete(mc);
+		if (reclaim && freed>reclaim)
+			break;
 	}
 
 	return freed;
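
With the new reclaim argument the cache flush becomes a bounded operation: entries are freed front to back and the loop stops as soon as more than the requested amount has been released, a zero target keeping the old flush-everything behaviour. A self-contained sketch of that loop shape, with an invented cached_chunk type standing in for the real memchunk and plain free() standing in for free_memory_on_node():

#include <stddef.h>
#include <stdlib.h>

/* Illustrative chunk: only the size matters for this sketch. */
struct cached_chunk
{
	size_t size;
	struct cached_chunk *next;
};

/* Free cached chunks until the cache is empty or more than `reclaim`
 * bytes have been released; reclaim == 0 means "flush everything". */
static size_t flush_cache(struct cached_chunk **cache, size_t reclaim)
{
	size_t freed = 0;
	struct cached_chunk *c = *cache;

	while (c)
	{
		struct cached_chunk *next = c->next;   /* c is freed below */
		freed += c->size;
		free(c);
		c = next;
		if (reclaim && freed > reclaim)
			break;
	}
	*cache = c;     /* whatever was not freed stays in the cache */
	return freed;
}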
@@ -506,30 +554,31 @@ static size_t flush_memchunk_cache(uint32_t node)
  * should only be used at the termination of StarPU for instance). The
  * mc_rwlock[node] rw-lock should be taken prior to calling this function.
  */
-static size_t free_potentially_in_use_mc(uint32_t node, unsigned force)
+static size_t free_potentially_in_use_mc(uint32_t node, unsigned force, size_t reclaim)
 {
 	size_t freed = 0;
 
-	starpu_mem_chunk_t mc, next_mc;
+	struct _starpu_mem_chunk *mc, *next_mc;
 
-	for (mc = starpu_mem_chunk_list_begin(mc_list[node]);
-	     mc != starpu_mem_chunk_list_end(mc_list[node]);
+	for (mc = _starpu_mem_chunk_list_begin(mc_list[node]);
+	     mc != _starpu_mem_chunk_list_end(mc_list[node]);
 	     mc = next_mc)
 	{
-		/* there is a risk that the memory chunk is freed 
+		/* there is a risk that the memory chunk is freed
 		   before next iteration starts: so we compute the next
 		   element of the list now */
-		next_mc = starpu_mem_chunk_list_next(mc);
+		next_mc = _starpu_mem_chunk_list_next(mc);
 
 		if (!force)
 		{
 			freed += try_to_free_mem_chunk(mc, node);
-			#if 0
-			if (freed > toreclaim)
+			#if 1
+			if (reclaim && freed > reclaim)
 				break;
 			#endif
 		}
-		else {
+		else
+		{
 			/* We must free the memory now: note that data
 			 * coherency is not maintained in that case ! */
 			freed += do_free_mem_chunk(mc, node);
@@ -539,19 +588,22 @@ static size_t free_potentially_in_use_mc(uint32_t node, unsigned force)
 	return freed;
 }
 
-static size_t reclaim_memory_generic(uint32_t node, unsigned force)
+static size_t reclaim_memory_generic(uint32_t node, unsigned force, size_t reclaim)
 {
 	size_t freed = 0;
 
-	PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
+
+	starpu_lru(node);
 
 	/* remove all buffers for which there was a removal request */
-	freed += flush_memchunk_cache(node);
+	freed += flush_memchunk_cache(node, reclaim);
 
 	/* try to free all allocated data potentially in use */
-	freed += free_potentially_in_use_mc(node, force);
+	if (reclaim && freed<reclaim)
+		freed += free_potentially_in_use_mc(node, force, reclaim);
 
-	PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
 
 	return freed;
 
@@ -564,13 +616,13 @@ static size_t reclaim_memory_generic(uint32_t node, unsigned force)
  */
 size_t _starpu_free_all_automatically_allocated_buffers(uint32_t node)
 {
-	return reclaim_memory_generic(node, 1);
+	return reclaim_memory_generic(node, 1, 0);
 }
 
-static starpu_mem_chunk_t _starpu_memchunk_init(struct starpu_data_replicate_s *replicate, size_t size, size_t interface_size, unsigned automatically_allocated)
+static struct _starpu_mem_chunk *_starpu_memchunk_init(struct _starpu_data_replicate *replicate, size_t size, size_t interface_size, unsigned automatically_allocated)
 {
-	starpu_mem_chunk_t mc = starpu_mem_chunk_new();
-	starpu_data_handle handle = replicate->handle;
+	struct _starpu_mem_chunk *mc = _starpu_mem_chunk_new();
+	starpu_data_handle_t handle = replicate->handle;
 
 	STARPU_ASSERT(handle);
 	STARPU_ASSERT(handle->ops);
@@ -581,8 +633,9 @@ static starpu_mem_chunk_t _starpu_memchunk_init(struct starpu_data_replicate_s *
 	mc->ops = handle->ops;
 	mc->data_was_deleted = 0;
 	mc->automatically_allocated = automatically_allocated;
-	mc->relaxed_coherency = replicate->relaxed_coherency;		
+	mc->relaxed_coherency = replicate->relaxed_coherency;
 	mc->replicate = replicate;
+	mc->replicate->mc = mc;
 
 	/* Save a copy of the interface */
 	mc->chunk_interface = malloc(interface_size);
@@ -592,49 +645,50 @@ static starpu_mem_chunk_t _starpu_memchunk_init(struct starpu_data_replicate_s *
 	return mc;
 }
 
-static void register_mem_chunk(struct starpu_data_replicate_s *replicate, size_t size, unsigned automatically_allocated)
+static void register_mem_chunk(struct _starpu_data_replicate *replicate, size_t size, unsigned automatically_allocated)
 {
 	unsigned dst_node = replicate->memory_node;
 
-	starpu_mem_chunk_t mc;
+	struct _starpu_mem_chunk *mc;
 
 	/* the interface was already filled by ops->allocate_data_on_node */
 	size_t interface_size = replicate->handle->ops->interface_size;
 
 	/* Put this memchunk in the list of memchunk in use */
-	mc = _starpu_memchunk_init(replicate, size, interface_size, automatically_allocated); 
+	mc = _starpu_memchunk_init(replicate, size, interface_size, automatically_allocated);
 
-	PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[dst_node]);
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[dst_node]);
 
-	starpu_mem_chunk_list_push_front(mc_list[dst_node], mc);
+	_starpu_mem_chunk_list_push_back(mc_list[dst_node], mc);
 
-	PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[dst_node]);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[dst_node]);
 }
 
 /* This function is called when the handle is destroyed (eg. when calling
  * unregister or unpartition). It puts all the memchunks that refer to the
  * specified handle into the cache. */
-void _starpu_request_mem_chunk_removal(starpu_data_handle handle, unsigned node)
+void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, unsigned node)
 {
-	PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
 
 	/* iterate over the list of memory chunks and remove the entry */
-	starpu_mem_chunk_t mc, next_mc;
-	for (mc = starpu_mem_chunk_list_begin(mc_list[node]);
-	     mc != starpu_mem_chunk_list_end(mc_list[node]);
+	struct _starpu_mem_chunk *mc, *next_mc;
+	for (mc = _starpu_mem_chunk_list_begin(mc_list[node]);
+	     mc != _starpu_mem_chunk_list_end(mc_list[node]);
 	     mc = next_mc)
 	{
-		next_mc = starpu_mem_chunk_list_next(mc);
+		next_mc = _starpu_mem_chunk_list_next(mc);
 
-		if (mc->data == handle) {
+		if (mc->data == handle)
+		{
 			/* we found the data */
 			mc->data_was_deleted = 1;
 
 			/* remove it from the main list */
-			starpu_mem_chunk_list_erase(mc_list[node], mc);
+			_starpu_mem_chunk_list_erase(mc_list[node], mc);
 
 			/* put it in the list of buffers to be removed */
-			starpu_mem_chunk_list_push_front(memchunk_cache[node], mc);
+			_starpu_mem_chunk_list_push_front(memchunk_cache[node], mc);
 
 			/* Note that we do not stop here because there can be
 			 * multiple replicates associated to the same handle on
@@ -643,7 +697,42 @@ void _starpu_request_mem_chunk_removal(starpu_data_handle handle, unsigned node)
 	}
 
 	/* there was no corresponding buffer ... */
-	PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+}
+
+static size_t _starpu_get_global_mem_size(int dst_node)
+{
+	enum starpu_node_kind kind = starpu_node_get_kind(dst_node);
+	size_t global_mem_size;
+
+	switch(kind)
+	{
+		case STARPU_CPU_RAM:
+#ifdef STARPU_DEVEL
+#warning to be fixed
+#endif
+			global_mem_size = 64*1024*1024;
+			break;
+#ifdef STARPU_USE_CUDA
+		case STARPU_CUDA_RAM:
+		{
+			int devid = _starpu_memory_node_to_devid(dst_node);
+			global_mem_size = starpu_cuda_get_global_mem_size(devid);
+			break;
+		}
+#endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_RAM:
+		{
+			int devid = _starpu_memory_node_to_devid(dst_node);
+			global_mem_size = starpu_opencl_get_global_mem_size(devid);
+			break;
+		}
+#endif
+		default:
+			STARPU_ASSERT(0);
+	}
+	return global_mem_size;
 }
 
 /*
@@ -658,7 +747,7 @@ void _starpu_request_mem_chunk_removal(starpu_data_handle handle, unsigned node)
  *
  */
 
-static ssize_t _starpu_allocate_interface(starpu_data_handle handle, struct starpu_data_replicate_s *replicate, uint32_t dst_node)
+static ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, uint32_t dst_node, unsigned is_prefetch)
 {
 	unsigned attempts = 0;
 	ssize_t allocated_memory;
@@ -669,64 +758,81 @@ static ssize_t _starpu_allocate_interface(starpu_data_handle handle, struct star
 	/* perhaps we can directly reuse a buffer in the free-list */
 	uint32_t footprint = _starpu_compute_data_footprint(handle);
 
-	STARPU_TRACE_START_ALLOC_REUSE(dst_node);
-	PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
+	_STARPU_TRACE_START_ALLOC_REUSE(dst_node);
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[dst_node]);
 
-	if (try_to_find_reusable_mem_chunk(dst_node, handle, footprint))
+	if (try_to_find_reusable_mem_chunk(dst_node, handle, replicate, footprint))
 	{
-		PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+		_STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[dst_node]);
 		_starpu_allocation_cache_hit(dst_node);
 		ssize_t data_size = _starpu_data_get_size(handle);
 		return data_size;
 	}
 
-	PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
-	STARPU_TRACE_END_ALLOC_REUSE(dst_node);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[dst_node]);
+	_STARPU_TRACE_END_ALLOC_REUSE(dst_node);
 #endif
 
-	do {
+	do
+	{
 		STARPU_ASSERT(handle->ops);
 		STARPU_ASSERT(handle->ops->allocate_data_on_node);
 
-		STARPU_TRACE_START_ALLOC(dst_node);
+		_STARPU_TRACE_START_ALLOC(dst_node);
 		STARPU_ASSERT(replicate->data_interface);
 
 #if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER)
-		if (_starpu_get_node_kind(dst_node) == STARPU_CUDA_RAM)
+		if (starpu_node_get_kind(dst_node) == STARPU_CUDA_RAM)
 		{
 			/* To facilitate the design of interface, we set the
 			 * proper CUDA device in case it is needed. This avoids
 			 * having to set it again in the malloc method of each
 			 * interface. */
-			cudaError_t err = cudaSetDevice(starpu_memory_node_to_devid(dst_node));
+			cudaError_t err = cudaSetDevice(_starpu_memory_node_to_devid(dst_node));
 			STARPU_ASSERT(err == cudaSuccess);
 		}
 #endif
 
 		allocated_memory = handle->ops->allocate_data_on_node(replicate->data_interface, dst_node);
-		STARPU_TRACE_END_ALLOC(dst_node);
+		_STARPU_TRACE_END_ALLOC(dst_node);
 
 		if (allocated_memory == -ENOMEM)
 		{
+			size_t reclaim = 0.25*_starpu_get_global_mem_size(dst_node);
+			if (starpu_memstrategy_data_size_coefficient*handle->data_size > reclaim)
+				reclaim = starpu_memstrategy_data_size_coefficient*handle->data_size;
+
+			/* Take temporary reference on the replicate */
 			replicate->refcnt++;
+			handle->busy_count++;
 			_starpu_spin_unlock(&handle->header_lock);
 
-			STARPU_TRACE_START_MEMRECLAIM(dst_node);
-			reclaim_memory_generic(dst_node, 0);
-			STARPU_TRACE_END_MEMRECLAIM(dst_node);
+			_STARPU_TRACE_START_MEMRECLAIM(dst_node);
+			if (is_prefetch) {
+				_STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[dst_node]);
+				flush_memchunk_cache(dst_node, reclaim);
+				_STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[dst_node]);
+			} else
+				reclaim_memory_generic(dst_node, 0, reclaim);
+			_STARPU_TRACE_END_MEMRECLAIM(dst_node);
 
 		        while (_starpu_spin_trylock(&handle->header_lock))
 		                _starpu_datawizard_progress(_starpu_get_local_memory_node(), 0);
-		
+
 			replicate->refcnt--;
+			STARPU_ASSERT(replicate->refcnt >= 0);
+			STARPU_ASSERT(handle->busy_count > 0);
+			handle->busy_count--;
+			_starpu_data_check_not_busy(handle);
 		}
-		
-	} while((allocated_memory == -ENOMEM) && attempts++ < 2);
+
+	}
+	while((allocated_memory == -ENOMEM) && attempts++ < 2);
 
 	return allocated_memory;
 }
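
On -ENOMEM the loop above computes a reclaim target, namely a quarter of the device memory or starpu_memstrategy_data_size_coefficient times the data size if that is larger, releases its locks, reclaims (only flushing the clean cache when the request is a prefetch), and retries at most twice. A condensed, self-contained sketch of that retry shape; every name below is invented, and the locking, tracing and refcount juggling are left out:

#include <stddef.h>

/* Retry-with-reclaim skeleton: on allocation failure, ask the reclaim
 * machinery for at least max(global_mem / 4, k * data_size) bytes and
 * try again a bounded number of times.  alloc() and reclaim() stand in
 * for the per-interface allocation method and the node-level reclaim. */
static void *alloc_with_reclaim(size_t data_size, size_t global_mem, double k,
				void *(*alloc)(size_t),
				size_t (*reclaim)(size_t))
{
	unsigned attempts = 0;
	void *ptr;

	do
	{
		ptr = alloc(data_size);
		if (ptr)
			break;

		size_t target = global_mem / 4;            /* 25% of the device memory */
		if ((size_t)(k * data_size) > target)
			target = (size_t)(k * data_size);
		reclaim(target);
	}
	while (attempts++ < 2);

	return ptr;
}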
 
-int _starpu_allocate_memory_on_node(starpu_data_handle handle, struct starpu_data_replicate_s *replicate)
+int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned is_prefetch)
 {
 	ssize_t allocated_memory;
 
@@ -739,7 +845,7 @@ int _starpu_allocate_memory_on_node(starpu_data_handle handle, struct starpu_dat
 		return 0;
 
 	STARPU_ASSERT(replicate->data_interface);
-	allocated_memory = _starpu_allocate_interface(handle, replicate, dst_node);
+	allocated_memory = _starpu_allocate_interface(handle, replicate, dst_node, is_prefetch);
 
 	/* perhaps we could really not handle that capacity misses */
 	if (allocated_memory == -ENOMEM)
@@ -762,7 +868,74 @@ int _starpu_allocate_memory_on_node(starpu_data_handle handle, struct starpu_dat
 	return 0;
 }
 
-unsigned starpu_data_test_if_allocated_on_node(starpu_data_handle handle, uint32_t memory_node)
+unsigned starpu_data_test_if_allocated_on_node(starpu_data_handle_t handle, uint32_t memory_node)
 {
 	return handle->per_node[memory_node].allocated;
 }
+
+void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node)
+{
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&lru_rwlock[node]);
+	struct _starpu_mem_chunk_lru *mc_lru=_starpu_mem_chunk_lru_new();
+	mc_lru->mc=mc;
+	_starpu_mem_chunk_lru_list_push_front(starpu_lru_list[node],mc_lru);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&lru_rwlock[node]);
+}
+
+/* The mc_rwlock[node] rw-lock should be taken prior to calling this function.*/
+static void _starpu_memchunk_recently_used_move(struct _starpu_mem_chunk *mc, unsigned node)
+{
+	/* XXX Sometimes the memchunk is not in the list... */
+	struct _starpu_mem_chunk *mc_iter;
+	for (mc_iter = _starpu_mem_chunk_list_begin(mc_list[node]);
+	     mc_iter != _starpu_mem_chunk_list_end(mc_list[node]);
+	     mc_iter = _starpu_mem_chunk_list_next(mc_iter) )
+	{
+		if (mc_iter==mc)
+		{
+			_starpu_mem_chunk_list_erase(mc_list[node], mc);
+			_starpu_mem_chunk_list_push_back(mc_list[node], mc);
+			return;
+		}
+
+	}
+}
+
+static void starpu_lru(unsigned node)
+{
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&lru_rwlock[node]);
+	while (!_starpu_mem_chunk_lru_list_empty(starpu_lru_list[node]))
+	{
+		struct _starpu_mem_chunk_lru *mc_lru=_starpu_mem_chunk_lru_list_front(starpu_lru_list[node]);
+		_starpu_memchunk_recently_used_move(mc_lru->mc, node);
+		_starpu_mem_chunk_lru_list_erase(starpu_lru_list[node], mc_lru);
+		_starpu_mem_chunk_lru_delete(mc_lru);
+	}
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&lru_rwlock[node]);
+}
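
The LRU bookkeeping is split so that the hot path stays cheap: _starpu_memchunk_recently_used() only pushes a small note onto a per-node side list under lru_rwlock, and starpu_lru(), called at the start of reclaim while mc_rwlock is held, drains that list and moves the touched chunks to the back of mc_list so eviction keeps scanning the colder entries first. A self-contained sketch of the two-list scheme with invented types, no locking and no error handling:

#include <stdlib.h>

/* Doubly-linked chunk in the main per-node list; `head` is a sentinel. */
struct chunk { struct chunk *prev, *next; };

/* Cheap singly-linked note recorded on the hot path. */
struct lru_note { struct chunk *mc; struct lru_note *next; };

struct node_state
{
	struct chunk head;       /* main list, scanned front to back when evicting */
	struct lru_note *feed;   /* side list filled by recently_used() */
};

static void node_init(struct node_state *s)
{
	s->head.prev = s->head.next = &s->head;
	s->feed = NULL;
}

static void list_unlink(struct chunk *c)
{
	c->prev->next = c->next;
	c->next->prev = c->prev;
}

static void list_push_back(struct chunk *head, struct chunk *c)
{
	c->prev = head->prev;
	c->next = head;
	head->prev->next = c;
	head->prev = c;
}

/* Hot path: only record that the chunk was touched. */
static void recently_used(struct node_state *s, struct chunk *mc)
{
	struct lru_note *n = malloc(sizeof(*n));
	n->mc = mc;
	n->next = s->feed;
	s->feed = n;
}

/* Reclaim path: replay the notes, moving touched chunks towards the back
 * of the main list (assumes the chunk is still linked in that list). */
static void drain_lru(struct node_state *s)
{
	while (s->feed)
	{
		struct lru_note *n = s->feed;
		s->feed = n->next;
		list_unlink(n->mc);
		list_push_back(&s->head, n->mc);
		free(n);
	}
}

The real _starpu_memchunk_recently_used_move() additionally verifies that the chunk is still present in the main list before moving it, as noted by the XXX comment above.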
+
+
+#ifdef STARPU_MEMORY_STATUS
+void _starpu_display_data_stats_by_node(int node)
+{
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
+
+	if (!_starpu_mem_chunk_list_empty(mc_list[node]))
+	{
+		fprintf(stderr, "#-------\n");
+		fprintf(stderr, "Data on Node #%d\n",node);
+
+		struct _starpu_mem_chunk *mc;
+
+		for (mc = _starpu_mem_chunk_list_begin(mc_list[node]);
+		     mc != _starpu_mem_chunk_list_end(mc_list[node]);
+		     mc = _starpu_mem_chunk_list_next(mc))
+		{
+			_starpu_display_data_handle_stats(mc->data);
+		}
+
+	}
+
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+}
+#endif

+ 16 - 8
src/datawizard/memalloc.h

@@ -26,21 +26,21 @@
 #include <datawizard/coherency.h>
 #include <datawizard/copy_driver.h>
 
-struct starpu_data_replicate_s;
+struct _starpu_data_replicate;
 
-LIST_TYPE(starpu_mem_chunk,
-	starpu_data_handle data;
+LIST_TYPE(_starpu_mem_chunk,
+	starpu_data_handle_t data;
 	size_t size;
 
 	uint32_t footprint;
-	
+
 	/* The footprint of the data is not sufficient to determine whether two
 	 * pieces of data have the same layout (there could be collision in the
 	 * hash function ...) so we still keep a copy of the actual layout (ie.
 	 * the data interface) to stay on the safe side. We make a copy of
 	 * because when a data is deleted, the memory chunk remains.
 	 */
-	struct starpu_data_interface_ops_t *ops;
+	struct starpu_data_interface_ops *ops;
 	void *chunk_interface;
 	unsigned automatically_allocated;
 	unsigned data_was_deleted;
@@ -48,12 +48,20 @@ LIST_TYPE(starpu_mem_chunk,
 	/* A buffer that is used for SCRATCH or reduction cannot be used with
 	 * filters. */
 	unsigned relaxed_coherency;
-	struct starpu_data_replicate_s *replicate;
+	struct _starpu_data_replicate *replicate;
+)
+
+/* LRU list */
+LIST_TYPE(_starpu_mem_chunk_lru,
+	struct _starpu_mem_chunk *mc;
 )
 
 void _starpu_init_mem_chunk_lists(void);
 void _starpu_deinit_mem_chunk_lists(void);
-void _starpu_request_mem_chunk_removal(starpu_data_handle handle, unsigned node);
-int _starpu_allocate_memory_on_node(starpu_data_handle handle, struct starpu_data_replicate_s *replicate);
+void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, unsigned node);
+int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned is_prefetch);
 size_t _starpu_free_all_automatically_allocated_buffers(uint32_t node);
+void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node);
+
+void _starpu_display_data_stats_by_node(int node);
 #endif

+ 35 - 22
src/datawizard/memory_nodes.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,20 +23,23 @@
 #include "copy_driver.h"
 #include "memalloc.h"
 
-static starpu_mem_node_descr descr;
+static struct _starpu_mem_node_descr descr;
 static pthread_key_t memory_node_key;
 
 void _starpu_init_memory_nodes(void)
 {
-	/* there is no node yet, subsequent nodes will be 
+	/* there is no node yet, subsequent nodes will be
 	 * added using _starpu_register_memory_node */
 	descr.nnodes = 0;
 
 	pthread_key_create(&memory_node_key, NULL);
 
 	unsigned i;
-	for (i = 0; i < STARPU_MAXNODES; i++) 
-		descr.nodes[i] = STARPU_UNUSED; 
+	for (i = 0; i < STARPU_MAXNODES; i++)
+	{
+		descr.nodes[i] = STARPU_UNUSED;
+		descr.nworkers[i] = 0;
+	}
 
 	_starpu_init_mem_chunk_lists();
 	_starpu_init_data_request_lists();
@@ -62,8 +65,8 @@ unsigned _starpu_get_local_memory_node(void)
 {
 	unsigned *memory_node;
 	memory_node = (unsigned *) pthread_getspecific(memory_node_key);
-	
-	/* in case this is called by the programmer, we assume the RAM node 
+
+	/* in case this is called by the programmer, we assume the RAM node
 	   is the appropriate memory node ... so we return 0 XXX */
 	if (STARPU_UNLIKELY(!memory_node))
 		return 0;
@@ -71,34 +74,44 @@ unsigned _starpu_get_local_memory_node(void)
 	return *memory_node;
 }
 
-starpu_mem_node_descr *_starpu_get_memory_node_description(void)
+void _starpu_memory_node_worker_add(unsigned node)
+{
+	descr.nworkers[node]++;
+}
+
+unsigned _starpu_memory_node_workers(unsigned node)
+{
+	return descr.nworkers[node];
+}
+
+struct _starpu_mem_node_descr *_starpu_get_memory_node_description(void)
 {
 	return &descr;
 }
 
-starpu_node_kind _starpu_get_node_kind(uint32_t node)
+enum starpu_node_kind starpu_node_get_kind(uint32_t node)
 {
 	return descr.nodes[node];
 }
 
-int starpu_memory_node_to_devid(unsigned node)
+int _starpu_memory_node_to_devid(unsigned node)
 {
 	return descr.devid[node];
 }
 
-unsigned _starpu_get_memory_nodes_count(void)
+unsigned starpu_memory_nodes_get_count(void)
 {
 	return descr.nnodes;
 }
 
-unsigned _starpu_register_memory_node(starpu_node_kind kind, int devid)
+unsigned _starpu_register_memory_node(enum starpu_node_kind kind, int devid)
 {
 	unsigned nnodes;
 	/* ATOMIC_ADD returns the new value ... */
 	nnodes = STARPU_ATOMIC_ADD(&descr.nnodes, 1);
 
 	descr.nodes[nnodes-1] = kind;
-	STARPU_TRACE_NEW_MEM_NODE(nnodes-1);
+	_STARPU_TRACE_NEW_MEM_NODE(nnodes-1);
 
 	descr.devid[nnodes-1] = devid;
 
@@ -115,8 +128,8 @@ void _starpu_memory_node_register_condition(pthread_cond_t *cond, pthread_mutex_
 {
 	unsigned cond_id;
 	unsigned nconds_total, nconds;
-	
-	pthread_rwlock_wrlock(&descr.conditions_rwlock);
+
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&descr.conditions_rwlock);
 
 	/* we only insert the queue if it's not already in the list */
 	nconds = descr.condition_count[nodeid];
@@ -127,7 +140,7 @@ void _starpu_memory_node_register_condition(pthread_cond_t *cond, pthread_mutex_
 			STARPU_ASSERT(descr.conditions_attached_to_node[nodeid][cond_id].mutex == mutex);
 
 			/* the condition is already in the list */
-			pthread_rwlock_unlock(&descr.conditions_rwlock);
+			_STARPU_PTHREAD_RWLOCK_UNLOCK(&descr.conditions_rwlock);
 			return;
 		}
 	}
@@ -138,28 +151,28 @@ void _starpu_memory_node_register_condition(pthread_cond_t *cond, pthread_mutex_
 	descr.condition_count[nodeid]++;
 
 	/* do we have to add it in the global list as well ? */
-	nconds_total = descr.total_condition_count; 
+	nconds_total = descr.total_condition_count;
 	for (cond_id = 0; cond_id < nconds_total; cond_id++)
 	{
 		if (descr.conditions_all[cond_id].cond == cond)
 		{
 			/* the queue is already in the global list */
-			pthread_rwlock_unlock(&descr.conditions_rwlock);
+			_STARPU_PTHREAD_RWLOCK_UNLOCK(&descr.conditions_rwlock);
 			return;
 		}
-	} 
+	}
 
 	/* it was not in the global list either */
 	descr.conditions_all[nconds_total].cond = cond;
 	descr.conditions_all[nconds_total].mutex = mutex;
 	descr.total_condition_count++;
 
-	pthread_rwlock_unlock(&descr.conditions_rwlock);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&descr.conditions_rwlock);
 }
 
 unsigned starpu_worker_get_memory_node(unsigned workerid)
 {
-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
 
 	/* This workerid may either be a basic worker or a combined worker */
 	unsigned nworkers = config->topology.nworkers;

+ 20 - 24
src/datawizard/memory_nodes.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,59 +23,55 @@
 #include <datawizard/coherency.h>
 #include <datawizard/memalloc.h>
 
-typedef enum {
-	STARPU_UNUSED     = 0x00,
-	STARPU_CPU_RAM    = 0x01,
-	STARPU_CUDA_RAM   = 0x02,
-	STARPU_OPENCL_RAM = 0x03,
-	STARPU_SPU_LS     = 0x04
-} starpu_node_kind;
-
-typedef starpu_node_kind starpu_memory_node_tuple;
 
 #define _STARPU_MEMORY_NODE_TUPLE(node1,node2) (node1 | (node2 << 4))
 #define _STARPU_MEMORY_NODE_TUPLE_FIRST(tuple) (tuple & 0x0F)
 #define _STARPU_MEMORY_NODE_TUPLE_SECOND(tuple) (tuple & 0xF0)
 
-struct _cond_and_mutex {
+struct _starpu_cond_and_mutex
+{
         pthread_cond_t *cond;
-        pthread_mutex_t *mutex;	
+        pthread_mutex_t *mutex;
 };
 
-typedef struct {
+struct _starpu_mem_node_descr
+{
 	unsigned nnodes;
-	starpu_node_kind nodes[STARPU_MAXNODES];
+	enum starpu_node_kind nodes[STARPU_MAXNODES];
 
 	/* Get the device id associated to this node, or -1 if not applicable */
 	int devid[STARPU_MAXNODES];
 
-	// TODO move this 2 lists outside starpu_mem_node_descr
+	unsigned nworkers[STARPU_MAXNODES];
+
+	// TODO move these 2 lists outside struct _starpu_mem_node_descr
 	/* Every worker is associated to a condition variable on which the
 	 * worker waits when there is a task available. It is possible that
 	 * multiple workers share the same condition variable, so we maintain a
 	 * list of all these condition variables so that we can wake up all
 	 * workers attached to a memory node that are waiting on a task. */
 	pthread_rwlock_t conditions_rwlock;
-	struct _cond_and_mutex conditions_attached_to_node[STARPU_MAXNODES][STARPU_NMAXWORKERS];
-	struct _cond_and_mutex conditions_all[STARPU_MAXNODES*STARPU_NMAXWORKERS];
+	struct _starpu_cond_and_mutex conditions_attached_to_node[STARPU_MAXNODES][STARPU_NMAXWORKERS];
+	struct _starpu_cond_and_mutex conditions_all[STARPU_MAXNODES*STARPU_NMAXWORKERS];
 	/* the number of queues attached to each node */
 	unsigned total_condition_count;
 	unsigned condition_count[STARPU_MAXNODES];
 
-} starpu_mem_node_descr;
+};
 
 void _starpu_init_memory_nodes(void);
 void _starpu_deinit_memory_nodes(void);
 void _starpu_set_local_memory_node_key(unsigned *node);
 unsigned _starpu_get_local_memory_node(void);
-unsigned _starpu_register_memory_node(starpu_node_kind kind, int devid);
+void _starpu_memory_node_worker_add(unsigned node);
+unsigned _starpu_memory_node_workers(unsigned node);
+unsigned _starpu_register_memory_node(enum starpu_node_kind kind, int devid);
 //void _starpu_memory_node_attach_queue(struct starpu_jobq_s *q, unsigned nodeid);
 void _starpu_memory_node_register_condition(pthread_cond_t *cond, pthread_mutex_t *mutex, unsigned memory_node);
 
-starpu_node_kind _starpu_get_node_kind(uint32_t node);
-int starpu_memory_node_to_devid(unsigned node);
-unsigned _starpu_get_memory_nodes_count(void);
+enum starpu_node_kind _starpu_node_get_kind(uint32_t node);
+int _starpu_memory_node_to_devid(unsigned node);
 
-starpu_mem_node_descr *_starpu_get_memory_node_description(void);
+struct _starpu_mem_node_descr *_starpu_get_memory_node_description(void);
 
 #endif // __MEMORY_NODES_H__

+ 3 - 2
src/datawizard/progress.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -25,6 +25,7 @@ void _starpu_datawizard_progress(uint32_t memory_node, unsigned may_alloc)
 	/* in case some other driver requested data */
 	_starpu_handle_pending_node_data_requests(memory_node);
 	_starpu_handle_node_data_requests(memory_node, may_alloc);
-
+	_starpu_handle_node_prefetch_requests(memory_node, may_alloc);
 	_starpu_execute_registered_progression_hooks();
 }
+
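
The progress step now also drains prefetch requests, and the order matters: transfers already in flight are completed first, ordinary (blocking) data requests are served next, and prefetch requests, which are best-effort, come last. A small sketch of that ordering; the progress_ops structure and its callbacks are invented stand-ins for the StarPU request-handling functions:

/* Order of operations in one driver progress step. */
struct progress_ops
{
	void (*handle_pending)(unsigned node);
	void (*handle_requests)(unsigned node, unsigned may_alloc);
	void (*handle_prefetch)(unsigned node, unsigned may_alloc);
	void (*run_hooks)(void);
};

static void progress_step(const struct progress_ops *ops,
			  unsigned node, unsigned may_alloc)
{
	ops->handle_pending(node);               /* transfers already started */
	ops->handle_requests(node, may_alloc);   /* blocking requests first */
	ops->handle_prefetch(node, may_alloc);   /* prefetches last, best-effort */
	ops->run_hooks();
}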

+ 155 - 82
src/datawizard/reduction.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -17,20 +17,24 @@
 
 #include <starpu.h>
 #include <common/utils.h>
+#include <util/starpu_data_cpy.h>
 #include <core/task.h>
 #include <datawizard/datawizard.h>
 
-void starpu_data_set_reduction_methods(starpu_data_handle handle,
-					struct starpu_codelet_t *redux_cl,
-					struct starpu_codelet_t *init_cl)
+void starpu_data_set_reduction_methods(starpu_data_handle_t handle,
+				       struct starpu_codelet *redux_cl,
+				       struct starpu_codelet *init_cl)
 {
 	_starpu_spin_lock(&handle->header_lock);
 
+	_starpu_codelet_check_deprecated_fields(redux_cl);
+	_starpu_codelet_check_deprecated_fields(init_cl);
+
 	unsigned child;
 	for (child = 0; child < handle->nchildren; child++)
 	{
 		/* make sure that the flags are applied to the children as well */
-		struct starpu_data_state_t *child_handle = &handle->children[child];
+		struct _starpu_data_state *child_handle = &handle->children[child];
 		if (child_handle->nchildren > 0)
 			starpu_data_set_reduction_methods(child_handle, redux_cl, init_cl);
 	}
@@ -41,27 +45,28 @@ void starpu_data_set_reduction_methods(starpu_data_handle handle,
 	_starpu_spin_unlock(&handle->header_lock);
 }
 
-void _starpu_redux_init_data_replicate(starpu_data_handle handle, struct starpu_data_replicate_s *replicate, int workerid)
+void _starpu_redux_init_data_replicate(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, int workerid)
 {
 	STARPU_ASSERT(replicate);
 	STARPU_ASSERT(replicate->allocated);
 
-	struct starpu_codelet_t *init_cl = handle->init_cl;
+	struct starpu_codelet *init_cl = handle->init_cl;
 	STARPU_ASSERT(init_cl);
 
-	cl_func init_func = NULL;
-	
+	_starpu_cl_func_t init_func = NULL;
+
 	/* TODO Check that worker may execute the codelet */
 
-	switch (starpu_worker_get_type(workerid)) {
+	switch (starpu_worker_get_type(workerid))
+	{
 		case STARPU_CPU_WORKER:
-			init_func = init_cl->cpu_func;
+			init_func = _starpu_task_get_cpu_nth_implementation(init_cl, 0);
 			break;
 		case STARPU_CUDA_WORKER:
-			init_func = init_cl->cuda_func;
+			init_func = _starpu_task_get_cuda_nth_implementation(init_cl, 0);
 			break;
 		case STARPU_OPENCL_WORKER:
-			init_func = init_cl->opencl_func;
+			init_func = _starpu_task_get_opencl_nth_implementation(init_cl, 0);
 			break;
 		default:
 			STARPU_ABORT();
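
The switch above simply picks the first implementation registered for the worker's architecture when running the user-provided init codelet. A toy version of that selection; the codelet_impls layout, the arch enum and pick_init_kernel() are invented for the example:

typedef void (*kernel_func)(void *buffers[], void *cl_arg);

enum arch { ARCH_CPU, ARCH_CUDA, ARCH_OPENCL, ARCH_NB };

struct codelet_impls
{
	kernel_func funcs[ARCH_NB][4];   /* [architecture][implementation] */
};

static kernel_func pick_init_kernel(const struct codelet_impls *cl, enum arch a)
{
	return cl->funcs[a][0];          /* implementation 0, as for init codelets */
}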
@@ -77,7 +82,7 @@ void _starpu_redux_init_data_replicate(starpu_data_handle handle, struct starpu_
 
 /* Enable reduction mode. This function must be called with the header lock
  * taken. */
-void starpu_data_start_reduction_mode(starpu_data_handle handle)
+void _starpu_data_start_reduction_mode(starpu_data_handle_t handle)
 {
 	STARPU_ASSERT(handle->reduction_refcnt == 0);
 
@@ -86,22 +91,40 @@ void starpu_data_start_reduction_mode(starpu_data_handle handle)
 	unsigned nworkers = starpu_worker_get_count();
 	for (worker = 0; worker < nworkers; worker++)
 	{
-		struct starpu_data_replicate_s *replicate;
+		struct _starpu_data_replicate *replicate;
 		replicate = &handle->per_worker[worker];
 		replicate->initialized = 0;
+		replicate->relaxed_coherency = 2;
+		if (replicate->mc)
+			replicate->mc->relaxed_coherency = 2;
 	}
 }
 
 //#define NO_TREE_REDUCTION
 
 /* Force reduction. The lock should already have been taken.  */
-void starpu_data_end_reduction_mode(starpu_data_handle handle)
+void _starpu_data_end_reduction_mode(starpu_data_handle_t handle)
 {
 	unsigned worker;
+	unsigned node;
+	unsigned empty; /* Whether the handle is initially unallocated */
 
 	/* Put every valid replicate in the same array */
 	unsigned replicate_count = 0;
-	starpu_data_handle replicate_array[STARPU_NMAXWORKERS];
+	starpu_data_handle_t replicate_array[1 + STARPU_NMAXWORKERS];
+
+	for (node = 0; node < STARPU_MAXNODES; node++)
+	{
+		if (handle->per_node[node].state != STARPU_INVALID)
+			break;
+	}
+	empty = node == STARPU_MAXNODES;
+
+#ifndef NO_TREE_REDUCTION
+	if (!empty)
+		/* Include the initial value into the reduction tree */
+		replicate_array[replicate_count++] = handle;
+#endif
 
 	/* Register all valid per-worker replicates */
 	unsigned nworkers = starpu_worker_get_count();
@@ -112,7 +135,7 @@ void starpu_data_end_reduction_mode(starpu_data_handle handle)
 			/* Make sure the replicate is not removed */
 			handle->per_worker[worker].refcnt++;
 
-			uint32_t home_node = starpu_worker_get_memory_node(worker); 
+			uint32_t home_node = starpu_worker_get_memory_node(worker);
 			starpu_data_register(&handle->reduction_tmp_handles[worker],
 				home_node, handle->per_worker[worker].data_interface, handle->ops);
 
@@ -120,21 +143,40 @@ void starpu_data_end_reduction_mode(starpu_data_handle handle)
 
 			replicate_array[replicate_count++] = handle->reduction_tmp_handles[worker];
 		}
-		else {
+		else
+		{
 			handle->reduction_tmp_handles[worker] = NULL;
 		}
 	}
 
 #ifndef NO_TREE_REDUCTION
-	handle->reduction_refcnt = 1;
+	if (empty) {
+		/* Only the final copy will touch the actual handle */
+		handle->reduction_refcnt = 1;
+	} else {
+		unsigned step = 1;
+		handle->reduction_refcnt = 0;
+		while (step < replicate_count)
+		{
+			/* Each stage will touch the actual handle */
+			handle->reduction_refcnt++;
+			step *= 2;
+		}
+	}
 #else
 	/* We know that in this reduction algorithm there is exactly one task per valid replicate. */
-	handle->reduction_refcnt = replicate_count;
+	handle->reduction_refcnt = replicate_count + empty;
 #endif
 
 //	fprintf(stderr, "REDUX REFCNT = %d\n", handle->reduction_refcnt);
-	
-	if (replicate_count > 0)
+
+	if (replicate_count >
+#ifndef NO_TREE_REDUCTION
+			!empty
+#else
+			0
+#endif
+			)
 	{
 		/* Temporarily unlock the handle */
 		_starpu_spin_unlock(&handle->header_lock);
@@ -144,9 +186,13 @@ void starpu_data_end_reduction_mode(starpu_data_handle handle)
 		 * replicate */
 		struct starpu_task *last_replicate_deps[replicate_count];
 		memset(last_replicate_deps, 0, replicate_count*sizeof(struct starpu_task *));
-	
-		unsigned step = 1;
-		while (step <= replicate_count)
+		struct starpu_task *redux_tasks[replicate_count];
+
+		/* Redux step-by-step for step from 1 to replicate_count/2, i.e.
+		 * 1-by-1, then 2-by-2, then 4-by-4, etc. */
+		unsigned step;
+		unsigned redux_task_idx = 0;
+		for (step = 1; step < replicate_count; step *=2)
 		{
 			unsigned i;
 			for (i = 0; i < replicate_count; i+=2*step)
@@ -156,90 +202,108 @@ void starpu_data_end_reduction_mode(starpu_data_handle handle)
 					/* Perform the reduction between replicates i
 					 * and i+step and put the result in replicate i */
 					struct starpu_task *redux_task = starpu_task_create();
-		
+
+					/* Mark these tasks so that StarPU does not block them
+					 * when they try to access the handle (for normal tasks,
+					 * data requests to that handle are frozen until the
+					 * data is coherent again). */
+					struct _starpu_job *j = _starpu_get_job_associated_to_task(redux_task);
+					j->reduction_task = 1;
+
 					redux_task->cl = handle->redux_cl;
 					STARPU_ASSERT(redux_task->cl);
-		
-					redux_task->buffers[0].handle = replicate_array[i];
-					redux_task->buffers[0].mode = STARPU_RW;
-		
-					redux_task->buffers[1].handle = replicate_array[i+step];
-					redux_task->buffers[1].mode = STARPU_R;
-	
-					redux_task->detach = 0;
-	
+
+					redux_task->handles[0] = replicate_array[i];
+					redux_task->cl->modes[0] = STARPU_RW;
+
+					redux_task->handles[1] = replicate_array[i+step];
+					redux_task->cl->modes[1] = STARPU_R;
+
 					int ndeps = 0;
 					struct starpu_task *task_deps[2];
-	
+
 					if (last_replicate_deps[i])
 						task_deps[ndeps++] = last_replicate_deps[i];
-	
+
 					if (last_replicate_deps[i+step])
 						task_deps[ndeps++] = last_replicate_deps[i+step];
-	
+
 					/* i depends on this task */
 					last_replicate_deps[i] = redux_task;
-	
+
 					/* we don't perform the reduction until both replicates are ready */
-					starpu_task_declare_deps_array(redux_task, ndeps, task_deps); 
-		
-					int ret = _starpu_task_submit_internal(redux_task);
-					STARPU_ASSERT(!ret);
-		
+					starpu_task_declare_deps_array(redux_task, ndeps, task_deps);
+
+					/* We cannot submit tasks here: we do
+					 * not want to depend on tasks that have
+					 * been completed, so we just store
+					 * this task: it will be submitted
+					 * later. */
+					redux_tasks[redux_task_idx++] = redux_task;
 				}
 			}
-
-			step *= 2;
 		}
-	
-		struct starpu_task *redux_task = starpu_task_create();
-
-		/* Mark these tasks so that StarPU does not block them
-		 * when they try to access the handle (normal tasks are
-		 * data requests to that handle are frozen until the
-		 * data is coherent again). */
-		starpu_job_t j = _starpu_get_job_associated_to_task(redux_task);
-		j->reduction_task = 1;
 
-		redux_task->cl = handle->redux_cl;
-		STARPU_ASSERT(redux_task->cl);
+		if (empty)
+			/* The handle was empty, we just need to copy the reduced value. */
+			_starpu_data_cpy(handle, replicate_array[0], 1, NULL, 0, 1, last_replicate_deps[0]);
 
-		redux_task->buffers[0].handle = handle;
-		redux_task->buffers[0].mode = STARPU_RW;
+		/* Let's submit all the reduction tasks. */
+		unsigned i;
+		for (i = 0; i < redux_task_idx; i++)
+		{
+			int ret = starpu_task_submit(redux_tasks[i]);
+			STARPU_ASSERT(ret == 0);
+		}
+#else
+		if (empty) {
+			struct starpu_task *redux_task = starpu_task_create();
 
-		redux_task->buffers[1].handle = replicate_array[0];
-		redux_task->buffers[1].mode = STARPU_R;
+			/* Mark these tasks so that StarPU does not block them
+			 * when they try to access the handle (for normal tasks,
+			 * data requests to that handle are frozen until the
+			 * data is coherent again). */
+			struct _starpu_job *j = _starpu_get_job_associated_to_task(redux_task);
+			j->reduction_task = 1;
 
-		if (last_replicate_deps[0])
-			starpu_task_declare_deps_array(redux_task, 1, &last_replicate_deps[0]);
+			redux_task->cl = handle->init_cl;
+			STARPU_ASSERT(redux_task->cl);
+#ifdef STARPU_DEVEL
+#  warning the mode should already be set in the codelet. Only check it is valid?
+#endif
+			redux_task->cl->modes[0] = STARPU_W;
+			redux_task->handles[0] = handle;
 
-		int ret = _starpu_task_submit_internal(redux_task);
-		STARPU_ASSERT(!ret);
+			int ret = starpu_task_submit(redux_task);
+			STARPU_ASSERT(!ret);
+		}
 
-#else
 		/* Create a set of tasks to perform the reduction */
 		unsigned replicate;
 		for (replicate = 0; replicate < replicate_count; replicate++)
 		{
 			struct starpu_task *redux_task = starpu_task_create();
-	
+
 			/* Mark these tasks so that StarPU does not block them
 			 * when they try to access the handle (normal tasks are
 			 * data requests to that handle are frozen until the
 			 * data is coherent again). */
-			starpu_job_t j = _starpu_get_job_associated_to_task(redux_task);
+			struct _starpu_job *j = _starpu_get_job_associated_to_task(redux_task);
 			j->reduction_task = 1;
-	
+
 			redux_task->cl = handle->redux_cl;
 			STARPU_ASSERT(redux_task->cl);
-	
-			redux_task->buffers[0].handle = handle;
-			redux_task->buffers[0].mode = STARPU_RW;
-	
-			redux_task->buffers[1].handle = replicate_array[replicate];
-			redux_task->buffers[1].mode = STARPU_R;
-	
-			int ret = _starpu_task_submit_internal(redux_task);
+
+#ifdef STARPU_DEVEL
+#  warning the modes should already be set in the codelet. Only check they are valid?
+#endif
+			redux_task->cl->modes[0] = STARPU_RW;
+			redux_task->cl->modes[1] = STARPU_R;
+
+			redux_task->handles[0] = handle;
+			redux_task->handles[1] = replicate_array[replicate];
+
+			int ret = starpu_task_submit(redux_task);
 			STARPU_ASSERT(!ret);
 		}
 #endif
@@ -247,17 +311,26 @@ void starpu_data_end_reduction_mode(starpu_data_handle handle)
 	_starpu_spin_lock(&handle->header_lock);
 
 	}
+
+	for (worker = 0; worker < nworkers; worker++)
+	{
+		struct _starpu_data_replicate *replicate;
+		replicate = &handle->per_worker[worker];
+		replicate->relaxed_coherency = 1;
+		if (replicate->mc)
+			replicate->mc->relaxed_coherency = 1;
+	}
 }
 
-void starpu_data_end_reduction_mode_terminate(starpu_data_handle handle)
+void _starpu_data_end_reduction_mode_terminate(starpu_data_handle_t handle)
 {
 	unsigned nworkers = starpu_worker_get_count();
 
-//	fprintf(stderr, "starpu_data_end_reduction_mode_terminate\n");
+//	fprintf(stderr, "_starpu_data_end_reduction_mode_terminate\n");
 	unsigned worker;
 	for (worker = 0; worker < nworkers; worker++)
 	{
-		struct starpu_data_replicate_s *replicate;
+		struct _starpu_data_replicate *replicate;
 		replicate = &handle->per_worker[worker];
 		replicate->initialized = 0;
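
The reduction_refcnt arithmetic above counts how many tree stages will eventually write into the real handle: one per doubling of the step until it reaches the number of contributing replicates, or just one (the final copy) when the handle started out empty. A tiny self-contained check of that step count; reduction_stages() is an invented name that reproduces the step loop from _starpu_data_end_reduction_mode():

#include <assert.h>

/* Smallest s such that 2^s >= replicate_count, i.e. the number of
 * pairwise-reduction passes performed by the tree reduction. */
static unsigned reduction_stages(unsigned replicate_count)
{
	unsigned step = 1, stages = 0;

	while (step < replicate_count)
	{
		stages++;        /* one more pass of pairwise reductions */
		step *= 2;
	}
	return stages;
}

int main(void)
{
	assert(reduction_stages(1) == 0);   /* a single replicate needs no merging */
	assert(reduction_stages(2) == 1);
	assert(reduction_stages(5) == 3);   /* 5 -> 3 -> 2 -> 1 partial results */
	assert(reduction_stages(8) == 3);
	return 0;
}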
 

+ 33 - 12
src/datawizard/sort_data_handles.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -26,10 +26,10 @@
  * them in order, so that we need a total order over data. We must also not
  * lock a child before its parent. */
 
-static void find_data_path(struct starpu_data_state_t *data, unsigned path[])
+static void find_data_path(struct _starpu_data_state *data, unsigned path[])
 {
 	unsigned depth = data->depth;
-	struct starpu_data_state_t *current = data;
+	struct _starpu_data_state *current = data;
 
 	/* Compute the path from the root to the data */
 	unsigned level; /* level is the distance between the node and the current node */
@@ -39,7 +39,7 @@ static void find_data_path(struct starpu_data_state_t *data, unsigned path[])
 		path[depth - level - 1] = current->sibling_index;
 		current = data->father_handle;
 	}
-} 
+}
 
 static int _compar_data_paths(const unsigned pathA[], unsigned depthA,
 				const unsigned pathB[], unsigned depthB)
@@ -64,12 +64,33 @@ static int _compar_data_paths(const unsigned pathA[], unsigned depthA,
 
 /* A comparison function between two handles makes it possible to use qsort to
  * sort a list of handles */
-static int _starpu_compar_handles(struct starpu_data_state_t *dataA,
-				struct starpu_data_state_t *dataB)
+static int _starpu_compar_handles(const struct starpu_buffer_descr *descrA,
+				  const struct starpu_buffer_descr *descrB)
 {
+	struct _starpu_data_state *dataA = descrA->handle;
+	struct _starpu_data_state *dataB = descrB->handle;
+
 	/* Perhaps we have the same piece of data */
 	if (dataA == dataB)
-		return 0;
+	{
+		/* Process write requests first; this is needed for proper
+		 * locking, see _submit_job_enforce_data_deps,
+		 * _starpu_fetch_task_input, and _starpu_push_task_output  */
+		if (descrA->mode & STARPU_W)
+		{
+			if (descrB->mode & STARPU_W)
+				/* Both A and B write, take the reader first */
+				if (descrA->mode & STARPU_R)
+					return -1;
+				else
+					return 1;
+			else
+				/* Only A writes, take it first */
+				return -1;
+		} else
+			/* A doesn't write, take B before */
+			return 1;
+	}
 
 	/* In case we have data/subdata from different trees */
 	if (dataA->root_handle != dataB->root_handle)
@@ -88,14 +109,14 @@ static int _starpu_compar_handles(struct starpu_data_state_t *dataA,
 
 static int _starpu_compar_buffer_descr(const void *_descrA, const void *_descrB)
 {
-	const starpu_buffer_descr *descrA = (const starpu_buffer_descr *) _descrA;
-	const starpu_buffer_descr *descrB = (const starpu_buffer_descr *) _descrB;
+	const struct starpu_buffer_descr *descrA = (const struct starpu_buffer_descr *) _descrA;
+	const struct starpu_buffer_descr *descrB = (const struct starpu_buffer_descr *) _descrB;
 
-	return _starpu_compar_handles(descrA->handle, descrB->handle);
+	return _starpu_compar_handles(descrA, descrB);
 }
 
 /* The descr array will be overwritten, so this must be a copy ! */
-void _starpu_sort_task_handles(starpu_buffer_descr descr[], unsigned nbuffers)
+void _starpu_sort_task_handles(struct starpu_buffer_descr descr[], unsigned nbuffers)
 {
-	qsort(descr, nbuffers, sizeof(starpu_buffer_descr), _starpu_compar_buffer_descr);
+	qsort(descr, nbuffers, sizeof(struct starpu_buffer_descr), _starpu_compar_buffer_descr);
 }
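
_starpu_compar_handles() now also orders duplicate handles within a task: the write access must be sorted before the read access to the same handle (and an RW access before a plain W) so that _starpu_fetch_task_input() and friends take the strongest access first. A self-contained model of that ordering, usable with qsort(); the address-based order between distinct handles replaces the real root/path comparison of the partition tree:

#include <stdint.h>
#include <stdlib.h>

enum mode { MODE_R = 1, MODE_W = 2, MODE_RW = MODE_R | MODE_W };

struct buffer_descr
{
	void *handle;
	enum mode mode;
};

/* Same handle: writers first, RW before plain W.
 * Different handles: any stable total order will do for the sketch. */
static int compar_buffers(const void *a_, const void *b_)
{
	const struct buffer_descr *a = a_;
	const struct buffer_descr *b = b_;

	if (a->handle == b->handle)
	{
		int aw = !!(a->mode & MODE_W), bw = !!(b->mode & MODE_W);
		if (aw != bw)
			return aw ? -1 : 1;          /* writer first */
		int ar = !!(a->mode & MODE_R), br = !!(b->mode & MODE_R);
		if (aw && ar != br)
			return ar ? -1 : 1;          /* RW before plain W */
		return 0;
	}

	uintptr_t ha = (uintptr_t) a->handle, hb = (uintptr_t) b->handle;
	return ha < hb ? -1 : 1;
}

static void sort_buffers(struct buffer_descr *descr, size_t n)
{
	qsort(descr, n, sizeof(*descr), compar_buffers);
}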

+ 1 - 1
src/datawizard/sort_data_handles.h

@@ -29,6 +29,6 @@
 /* To avoid deadlocks, we reorder the different buffers accessed by the task
  * so that we always grab the rw-lock associated to the handles in the same
  * order. */
-void _starpu_sort_task_handles(starpu_buffer_descr descr[], unsigned nbuffers);
+void _starpu_sort_task_handles(struct starpu_buffer_descr descr[], unsigned nbuffers);
 
 #endif // SORT_DATA_HANDLES

+ 143 - 99
src/datawizard/user_interactions.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -22,16 +22,17 @@
 #include <datawizard/copy_driver.h>
 #include <datawizard/write_back.h>
 #include <core/dependencies/data_concurrency.h>
+#include <core/sched_policy.h>
 
 /* Explicitly ask StarPU to allocate room for a piece of data on the specified
  * memory node. */
-int starpu_data_request_allocation(starpu_data_handle handle, uint32_t node)
+int starpu_data_request_allocation(starpu_data_handle_t handle, uint32_t node)
 {
-	starpu_data_request_t r;
+	struct _starpu_data_request *r;
 
 	STARPU_ASSERT(handle);
 
-	r = _starpu_create_data_request(handle, NULL, &handle->per_node[node], node, 0, 0);
+	r = _starpu_create_data_request(handle, NULL, &handle->per_node[node], node, STARPU_NONE, 0, 0);
 
 	/* we do not increase the refcnt associated to the request since we are
 	 * not waiting for its termination */
@@ -41,9 +42,10 @@ int starpu_data_request_allocation(starpu_data_handle handle, uint32_t node)
 	return 0;
 }
 
-struct user_interaction_wrapper {
-	starpu_data_handle handle;
-	starpu_access_mode mode;
+struct user_interaction_wrapper
+{
+	starpu_data_handle_t handle;
+	enum starpu_access_mode mode;
 	unsigned node;
 	pthread_cond_t cond;
 	pthread_mutex_t lock;
@@ -63,7 +65,7 @@ struct user_interaction_wrapper {
 static void _starpu_data_acquire_fetch_data_callback(void *arg)
 {
 	struct user_interaction_wrapper *wrapper = (struct user_interaction_wrapper *) arg;
-	starpu_data_handle handle = wrapper->handle;
+	starpu_data_handle_t handle = wrapper->handle;
 
 	/* At that moment, the caller holds a reference to the piece of data.
 	 * We enqueue the "post" sync task in the list associated to the handle
@@ -81,14 +83,14 @@ static void _starpu_data_acquire_continuation_non_blocking(void *arg)
 	int ret;
 	struct user_interaction_wrapper *wrapper = (struct user_interaction_wrapper *) arg;
 
-	starpu_data_handle handle = wrapper->handle;
+	starpu_data_handle_t handle = wrapper->handle;
 
 	STARPU_ASSERT(handle);
 
-	struct starpu_data_replicate_s *ram_replicate = &handle->per_node[0];
+	struct _starpu_data_replicate *ram_replicate = &handle->per_node[0];
 
-	ret = _starpu_fetch_data_on_node(handle, ram_replicate, wrapper->mode, 1,
-			_starpu_data_acquire_fetch_data_callback, wrapper);
+	ret = _starpu_fetch_data_on_node(handle, ram_replicate, wrapper->mode, 0, 1,
+					 _starpu_data_acquire_fetch_data_callback, wrapper);
 	STARPU_ASSERT(!ret);
 }
 
@@ -108,10 +110,11 @@ static void starpu_data_acquire_cb_pre_sync_callback(void *arg)
 }
 
 /* The data must be released by calling starpu_data_release later on */
-int starpu_data_acquire_cb(starpu_data_handle handle,
-		starpu_access_mode mode, void (*callback)(void *), void *arg)
+int starpu_data_acquire_cb(starpu_data_handle_t handle,
+			   enum starpu_access_mode mode, void (*callback)(void *), void *arg)
 {
 	STARPU_ASSERT(handle);
+	STARPU_ASSERT_MSG(handle->nchildren == 0, "Acquiring a partitioned data is not possible");
         _STARPU_LOG_IN();
 
 	struct user_interaction_wrapper *wrapper = (struct user_interaction_wrapper *) malloc(sizeof(struct user_interaction_wrapper));
@@ -121,21 +124,15 @@ int starpu_data_acquire_cb(starpu_data_handle handle,
 	wrapper->mode = mode;
 	wrapper->callback = callback;
 	wrapper->callback_arg = arg;
-	PTHREAD_COND_INIT(&wrapper->cond, NULL);
-	PTHREAD_MUTEX_INIT(&wrapper->lock, NULL);
+	_STARPU_PTHREAD_COND_INIT(&wrapper->cond, NULL);
+	_STARPU_PTHREAD_MUTEX_INIT(&wrapper->lock, NULL);
 	wrapper->finished = 0;
 
-#ifdef STARPU_DEVEL
-#warning TODO instead of having the is_prefetch argument, _starpu_fetch_data shoud consider two flags: async and detached
-#endif
-	_starpu_spin_lock(&handle->header_lock);
-	handle->per_node[0].refcnt++;
-	_starpu_spin_unlock(&handle->header_lock);
-
-	PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
 	int sequential_consistency = handle->sequential_consistency;
 	if (sequential_consistency)
 	{
+		struct starpu_task *new_task;
 		wrapper->pre_sync_task = starpu_task_create();
 		wrapper->pre_sync_task->detach = 1;
 		wrapper->pre_sync_task->callback_func = starpu_data_acquire_cb_pre_sync_callback;
@@ -145,21 +142,27 @@ int starpu_data_acquire_cb(starpu_data_handle handle,
 		wrapper->post_sync_task->detach = 1;
 
 #ifdef STARPU_USE_FXT
-                starpu_job_t job = _starpu_get_job_associated_to_task(wrapper->pre_sync_task);
+                struct _starpu_job *job = _starpu_get_job_associated_to_task(wrapper->pre_sync_task);
                 job->model_name = "acquire_cb_pre";
                 job = _starpu_get_job_associated_to_task(wrapper->post_sync_task);
                 job->model_name = "acquire_cb_post";
 #endif
 
-		_starpu_detect_implicit_data_deps_with_handle(wrapper->pre_sync_task, wrapper->post_sync_task, handle, mode);
-		PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
+		new_task = _starpu_detect_implicit_data_deps_with_handle(wrapper->pre_sync_task, wrapper->post_sync_task, handle, mode);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
+
+		if (new_task) {
+			int ret = starpu_task_submit(new_task);
+			STARPU_ASSERT(!ret);
+		}
 
 		/* TODO detect if this is superfluous */
-		int ret = _starpu_task_submit_internal(wrapper->pre_sync_task);
+		int ret = starpu_task_submit(wrapper->pre_sync_task);
 		STARPU_ASSERT(!ret);
 	}
-	else {
-		PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
+	else
+	{
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 
 		starpu_data_acquire_cb_pre_sync_callback(wrapper);
 	}
@@ -175,33 +178,48 @@ static inline void _starpu_data_acquire_continuation(void *arg)
 {
 	struct user_interaction_wrapper *wrapper = (struct user_interaction_wrapper *) arg;
 
-	starpu_data_handle handle = wrapper->handle;
+	starpu_data_handle_t handle = wrapper->handle;
 
 	STARPU_ASSERT(handle);
 
-	struct starpu_data_replicate_s *ram_replicate = &handle->per_node[0];
+	struct _starpu_data_replicate *ram_replicate = &handle->per_node[0];
+
+	_starpu_fetch_data_on_node(handle, ram_replicate, wrapper->mode, 0, 0, NULL, NULL);
 
-	_starpu_fetch_data_on_node(handle, ram_replicate, wrapper->mode, 0, NULL, NULL);
-	
 	/* continuation of starpu_data_acquire */
-	PTHREAD_MUTEX_LOCK(&wrapper->lock);
+	_STARPU_PTHREAD_MUTEX_LOCK(&wrapper->lock);
 	wrapper->finished = 1;
-	PTHREAD_COND_SIGNAL(&wrapper->cond);
-	PTHREAD_MUTEX_UNLOCK(&wrapper->lock);
+	_STARPU_PTHREAD_COND_SIGNAL(&wrapper->cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&wrapper->lock);
 }
 
 /* The data must be released by calling starpu_data_release later on */
-int starpu_data_acquire(starpu_data_handle handle, starpu_access_mode mode)
+int starpu_data_acquire(starpu_data_handle_t handle, enum starpu_access_mode mode)
 {
 	STARPU_ASSERT(handle);
+	STARPU_ASSERT_MSG(handle->nchildren == 0, "Acquiring a partitioned data is not possible");
         _STARPU_LOG_IN();
 
-	/* it is forbidden to call this function from a callback or a codelet */
-	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls())) {
+	/* unless asynchronous, it is forbidden to call this function from a callback or a codelet */
+	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
+	{
                 _STARPU_LOG_OUT_TAG("EDEADLK");
 		return -EDEADLK;
         }
 
+	if (_starpu_data_is_multiformat_handle(handle) &&
+	    _starpu_handle_needs_conversion_task(handle, 0))
+	{
+		struct starpu_task *task = _starpu_create_conversion_task(handle, 0);
+		int ret;
+		handle->refcnt--;
+		handle->busy_count--;
+		handle->mf_node = 0;
+		task->synchronous = 1;
+		ret = starpu_task_submit(task);
+		STARPU_ASSERT(!ret);
+	}
+
 	struct user_interaction_wrapper wrapper =
 	{
 		.handle = handle,
@@ -213,10 +231,11 @@ int starpu_data_acquire(starpu_data_handle handle, starpu_access_mode mode)
 	};
 
 //	_STARPU_DEBUG("TAKE sequential_consistency_mutex starpu_data_acquire\n");
-	PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
 	int sequential_consistency = handle->sequential_consistency;
 	if (sequential_consistency)
 	{
+		struct starpu_task *new_task;
 		wrapper.pre_sync_task = starpu_task_create();
 		wrapper.pre_sync_task->detach = 0;
 
@@ -224,23 +243,27 @@ int starpu_data_acquire(starpu_data_handle handle, starpu_access_mode mode)
 		wrapper.post_sync_task->detach = 1;
 
 #ifdef STARPU_USE_FXT
-                starpu_job_t job = _starpu_get_job_associated_to_task(wrapper.pre_sync_task);
+                struct _starpu_job *job = _starpu_get_job_associated_to_task(wrapper.pre_sync_task);
                 job->model_name = "acquire_pre";
                 job = _starpu_get_job_associated_to_task(wrapper.post_sync_task);
                 job->model_name = "acquire_post";
 #endif
 
-		_starpu_detect_implicit_data_deps_with_handle(wrapper.pre_sync_task, wrapper.post_sync_task, handle, mode);
-		PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
+		new_task = _starpu_detect_implicit_data_deps_with_handle(wrapper.pre_sync_task, wrapper.post_sync_task, handle, mode);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
+		if (new_task) {
+			int ret = starpu_task_submit(new_task);
+			STARPU_ASSERT(!ret);
+		}
 
 		/* TODO detect if this is superfluous */
 		wrapper.pre_sync_task->synchronous = 1;
-		int ret = _starpu_task_submit_internal(wrapper.pre_sync_task);
+		int ret = starpu_task_submit(wrapper.pre_sync_task);
 		STARPU_ASSERT(!ret);
-		//starpu_task_wait(wrapper.pre_sync_task);
 	}
-	else {
-		PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
+	else
+	{
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 	}
 
 	/* we try to get the data, if we do not succeed immediately, we set a
@@ -249,15 +272,16 @@ int starpu_data_acquire(starpu_data_handle handle, starpu_access_mode mode)
 	if (!_starpu_attempt_to_submit_data_request_from_apps(handle, mode, _starpu_data_acquire_continuation, &wrapper))
 	{
 		/* no one has locked this data yet, so we proceed immediately */
-		struct starpu_data_replicate_s *ram_replicate = &handle->per_node[0];
-		int ret = _starpu_fetch_data_on_node(handle, ram_replicate, mode, 0, NULL, NULL);
+		struct _starpu_data_replicate *ram_replicate = &handle->per_node[0];
+		int ret = _starpu_fetch_data_on_node(handle, ram_replicate, mode, 0, 0, NULL, NULL);
 		STARPU_ASSERT(!ret);
 	}
-	else {
-		PTHREAD_MUTEX_LOCK(&wrapper.lock);
+	else
+	{
+		_STARPU_PTHREAD_MUTEX_LOCK(&wrapper.lock);
 		while (!wrapper.finished)
-			PTHREAD_COND_WAIT(&wrapper.cond, &wrapper.lock);
-		PTHREAD_MUTEX_UNLOCK(&wrapper.lock);
+			_STARPU_PTHREAD_COND_WAIT(&wrapper.cond, &wrapper.lock);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&wrapper.lock);
 	}
 
 	/* At that moment, the caller holds a reference to the piece of data.
@@ -272,7 +296,7 @@ int starpu_data_acquire(starpu_data_handle handle, starpu_access_mode mode)
 
 /* This function must be called after starpu_data_acquire so that the
  * application releases the data */
-void starpu_data_release(starpu_data_handle handle)
+void starpu_data_release(starpu_data_handle_t handle)
 {
 	STARPU_ASSERT(handle);
 
@@ -286,71 +310,91 @@ void starpu_data_release(starpu_data_handle handle)
 static void _prefetch_data_on_node(void *arg)
 {
 	struct user_interaction_wrapper *wrapper = (struct user_interaction_wrapper *) arg;
-	starpu_data_handle handle = wrapper->handle;
+	starpu_data_handle_t handle = wrapper->handle;
         int ret;
 
-	struct starpu_data_replicate_s *replicate = &handle->per_node[wrapper->node];
-	ret = _starpu_fetch_data_on_node(handle, replicate, STARPU_R, wrapper->async, NULL, NULL);
+	struct _starpu_data_replicate *replicate = &handle->per_node[wrapper->node];
+	ret = _starpu_fetch_data_on_node(handle, replicate, STARPU_R, wrapper->async, wrapper->async, NULL, NULL);
         STARPU_ASSERT(!ret);
 
-        PTHREAD_MUTEX_LOCK(&wrapper->lock);
-	wrapper->finished = 1;
-	PTHREAD_COND_SIGNAL(&wrapper->cond);
-	PTHREAD_MUTEX_UNLOCK(&wrapper->lock);
-
-	if (!wrapper->async)
-	{
-		_starpu_spin_lock(&handle->header_lock);
-		_starpu_notify_data_dependencies(handle);
-		_starpu_spin_unlock(&handle->header_lock);
+	if (wrapper->async)
+		free(wrapper);
+	else {
+		_STARPU_PTHREAD_MUTEX_LOCK(&wrapper->lock);
+		wrapper->finished = 1;
+		_STARPU_PTHREAD_COND_SIGNAL(&wrapper->cond);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&wrapper->lock);
 	}
 
+	_starpu_spin_lock(&handle->header_lock);
+	_starpu_notify_data_dependencies(handle);
+	_starpu_spin_unlock(&handle->header_lock);
 }
 
 static
-int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle handle, unsigned node, unsigned async, starpu_access_mode mode)
+int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigned node, unsigned async, enum starpu_access_mode mode)
 {
 	STARPU_ASSERT(handle);
 
 	/* it is forbidden to call this function from a callback or a codelet */
-	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
+	if (STARPU_UNLIKELY(!async && !_starpu_worker_may_perform_blocking_calls()))
 		return -EDEADLK;
 
-	struct user_interaction_wrapper wrapper =
-	{
-		.handle = handle,
-		.node = node,
-		.async = async,
-		.cond = PTHREAD_COND_INITIALIZER,
-		.lock = PTHREAD_MUTEX_INITIALIZER,
-		.finished = 0
-	};
+	struct user_interaction_wrapper *wrapper = (struct user_interaction_wrapper *) malloc(sizeof(*wrapper));
+
+	wrapper->handle = handle;
+	wrapper->node = node;
+	wrapper->async = async;
+	_STARPU_PTHREAD_COND_INIT(&wrapper->cond, NULL);
+	_STARPU_PTHREAD_MUTEX_INIT(&wrapper->lock, NULL);
+	wrapper->finished = 0;
 
-	if (!_starpu_attempt_to_submit_data_request_from_apps(handle, mode, _prefetch_data_on_node, &wrapper))
+	if (!_starpu_attempt_to_submit_data_request_from_apps(handle, mode, _prefetch_data_on_node, wrapper))
 	{
 		/* we can immediately proceed */
-		struct starpu_data_replicate_s *replicate = &handle->per_node[node];
-		_starpu_fetch_data_on_node(handle, replicate, mode, async, NULL, NULL);
+		struct _starpu_data_replicate *replicate = &handle->per_node[node];
+
+		free(wrapper);
+
+		_starpu_fetch_data_on_node(handle, replicate, mode, async, async, NULL, NULL);
 
 		/* remove the "lock"/reference */
+
+		_starpu_spin_lock(&handle->header_lock);
+
 		if (!async)
 		{
-			_starpu_spin_lock(&handle->header_lock);
-			_starpu_notify_data_dependencies(handle);
-			_starpu_spin_unlock(&handle->header_lock);
+			/* Release our refcnt, like _starpu_release_data_on_node would do */
+			replicate->refcnt--;
+			STARPU_ASSERT(replicate->refcnt >= 0);
+			STARPU_ASSERT(handle->busy_count > 0);
+			handle->busy_count--;
+			_starpu_data_check_not_busy(handle);
 		}
+
+		/* In case there was a temporary handle (eg. used for reduction), this
+		 * handle may have requested to be destroyed when the data is released
+		 * */
+		unsigned handle_was_destroyed = handle->lazy_unregister;
+
+		_starpu_notify_data_dependencies(handle);
+
+		if (!handle_was_destroyed)
+			_starpu_spin_unlock(&handle->header_lock);
 	}
-	else {
-		PTHREAD_MUTEX_LOCK(&wrapper.lock);
-		while (!wrapper.finished)
-			PTHREAD_COND_WAIT(&wrapper.cond, &wrapper.lock);
-		PTHREAD_MUTEX_UNLOCK(&wrapper.lock);
+	else if (!async)
+	{
+		_STARPU_PTHREAD_MUTEX_LOCK(&wrapper->lock);
+		while (!wrapper->finished)
+			_STARPU_PTHREAD_COND_WAIT(&wrapper->cond, &wrapper->lock);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&wrapper->lock);
+		free(wrapper);
 	}
 
 	return 0;
 }
 
-int starpu_data_prefetch_on_node(starpu_data_handle handle, unsigned node, unsigned async)
+int starpu_data_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 {
 	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R);
 }
@@ -359,7 +403,7 @@ int starpu_data_prefetch_on_node(starpu_data_handle handle, unsigned node, unsig
  *	It is possible to specify that a piece of data can be discarded without
  *	impacting the application.
  */
-void starpu_data_advise_as_important(starpu_data_handle handle, unsigned is_important)
+void starpu_data_advise_as_important(starpu_data_handle_t handle, unsigned is_important)
 {
 	_starpu_spin_lock(&handle->header_lock);
 
@@ -368,7 +412,7 @@ void starpu_data_advise_as_important(starpu_data_handle handle, unsigned is_impo
 	for (child = 0; child < handle->nchildren; child++)
 	{
 		/* make sure the intermediate children are advised as well */
-		struct starpu_data_state_t *child_handle = &handle->children[child];
+		struct _starpu_data_state *child_handle = &handle->children[child];
 		if (child_handle->nchildren > 0)
 			starpu_data_advise_as_important(child_handle, is_important);
 	}
@@ -380,7 +424,7 @@ void starpu_data_advise_as_important(starpu_data_handle handle, unsigned is_impo
 
 }
 
-void starpu_data_set_sequential_consistency_flag(starpu_data_handle handle, unsigned flag)
+void starpu_data_set_sequential_consistency_flag(starpu_data_handle_t handle, unsigned flag)
 {
 	_starpu_spin_lock(&handle->header_lock);
 
@@ -388,14 +432,14 @@ void starpu_data_set_sequential_consistency_flag(starpu_data_handle handle, unsi
 	for (child = 0; child < handle->nchildren; child++)
 	{
 		/* make sure that the flags are applied to the children as well */
-		struct starpu_data_state_t *child_handle = &handle->children[child];
+		struct _starpu_data_state *child_handle = &handle->children[child];
 		if (child_handle->nchildren > 0)
 			starpu_data_set_sequential_consistency_flag(child_handle, flag);
 	}
 
-	PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
 	handle->sequential_consistency = flag;
-	PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 
 	_starpu_spin_unlock(&handle->header_lock);
 }
@@ -414,7 +458,7 @@ void starpu_data_set_default_sequential_consistency_flag(unsigned flag)
 }
 
 /* Query the status of the handle on the specified memory node. */
-void starpu_data_query_status(starpu_data_handle handle, int memory_node, int *is_allocated, int *is_valid, int *is_requested)
+void starpu_data_query_status(starpu_data_handle_t handle, int memory_node, int *is_allocated, int *is_valid, int *is_requested)
 {
 #ifdef STARPU_DEVEL
 #warning FIXME
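
The public entry points renamed above keep their user-level semantics: starpu_data_acquire() blocks until the application may touch the main-memory copy in the requested mode, starpu_data_release() hands it back, and starpu_data_prefetch_on_node() requests an (optionally asynchronous) transfer towards a given memory node. A minimal usage sketch, not part of the patch, assuming a handle registered elsewhere and that node 1 is a device memory node:

    #include <starpu.h>

    /* "handle" is assumed to be registered beforehand; node 1 is assumed
     * to be a device memory node (node 0 is the RAM node used above). */
    static void use_handle(starpu_data_handle_t handle)
    {
        /* ask StarPU to bring a copy onto memory node 1, asynchronously */
        starpu_data_prefetch_on_node(handle, 1, 1);

        /* block until the application may safely access the RAM copy */
        if (starpu_data_acquire(handle, STARPU_RW) == 0)
        {
            /* ... inspect or modify the local copy here ... */
            starpu_data_release(handle);
        }
    }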

+ 32 - 16
src/datawizard/write_back.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -17,48 +17,64 @@
 
 #include <datawizard/datawizard.h>
 #include <datawizard/write_back.h>
+#include <core/dependencies/data_concurrency.h>
 
-void _starpu_write_through_data(starpu_data_handle handle, uint32_t requesting_node, 
-					   uint32_t write_through_mask)
+static void wt_callback(void *arg)
 {
-	if ((write_through_mask & ~(1<<requesting_node)) == 0) {
+	starpu_data_handle_t handle = (starpu_data_handle_t) arg;
+
+	_starpu_spin_lock(&handle->header_lock);
+	_starpu_notify_data_dependencies(handle);
+	_starpu_spin_unlock(&handle->header_lock);
+}
+
+void _starpu_write_through_data(starpu_data_handle_t handle, uint32_t requesting_node,
+				uint32_t write_through_mask)
+{
+	if ((write_through_mask & ~(1<<requesting_node)) == 0)
+	{
 		/* nothing will be done ... */
 		return;
 	}
 
 	/* first commit all changes onto the nodes specified by the mask */
-	uint32_t node;
-	for (node = 0; node < STARPU_MAXNODES; node++)
+	uint32_t node, max;
+	for (node = 0, max = starpu_memory_nodes_get_count(); node < max; node++)
 	{
-		if (write_through_mask & (1<<node)) {
+		if (write_through_mask & (1<<node))
+		{
 			/* we need to commit the buffer on that node */
-			if (node != requesting_node) 
+			if (node != requesting_node)
 			{
 				while (_starpu_spin_trylock(&handle->header_lock))
 					_starpu_datawizard_progress(requesting_node, 1);
 
-				starpu_data_request_t r;
-				r = create_request_to_fetch_data(handle, &handle->per_node[node],
-								STARPU_R, 0, NULL, NULL);
+				/* We need to keep a Read lock to avoid letting writers corrupt our copy.  */
+				STARPU_ASSERT(handle->current_mode != STARPU_REDUX);
+				STARPU_ASSERT(handle->current_mode != STARPU_SCRATCH);
+				handle->refcnt++;
+				handle->busy_count++;
+				handle->current_mode = STARPU_R;
+
+				struct _starpu_data_request *r;
+				r = _starpu_create_request_to_fetch_data(handle, &handle->per_node[node],
+									 STARPU_R, 1, 1, wt_callback, handle);
 
 			        /* If no request was created, the handle was already up-to-date on the
 			         * node */
 			        if (r)
-				{
 				        _starpu_spin_unlock(&handle->header_lock);
-        				_starpu_wait_data_request_completion(r, 1);
-				}
 			}
 		}
 	}
 }
 
-void starpu_data_set_wt_mask(starpu_data_handle handle, uint32_t wt_mask)
+void starpu_data_set_wt_mask(starpu_data_handle_t handle, uint32_t wt_mask)
 {
 	handle->wt_mask = wt_mask;
 
 	/* in case the data has some children, set their wt_mask as well */
-	if (handle->nchildren > 0) 
+	if (handle->nchildren > 0)
 	{
 		unsigned child;
 		for (child = 0; child < handle->nchildren; child++)

+ 2 - 2
src/datawizard/write_back.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,7 +24,7 @@
 /* If a write-through mask is associated with that data handle, this propagates
  * the current value of the data onto the different memory nodes in the
  * write_through_mask. */
-void _starpu_write_through_data(starpu_data_handle handle, uint32_t requesting_node, 
+void _starpu_write_through_data(starpu_data_handle_t handle, uint32_t requesting_node,
 					   uint32_t write_through_mask);
 
 #endif // __DW_WRITE_BACK_H__

+ 57 - 59
src/drivers/cpu/driver_cpu.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
@@ -27,22 +27,19 @@
 #include <core/sched_policy.h>
 #include <core/sched_ctx.h>
 
-static int execute_job_on_cpu(starpu_job_t j, struct starpu_worker_s *cpu_args, int is_parallel_task, int rank, enum starpu_perf_archtype perf_arch)
+static int execute_job_on_cpu(struct _starpu_job *j, struct _starpu_worker *cpu_args, int is_parallel_task, int rank, enum starpu_perf_archtype perf_arch)
 {
 	int ret;
 	struct timespec codelet_start, codelet_end;
 
-	unsigned calibrate_model = 0;
-	int workerid = cpu_args->workerid;
 	struct starpu_task *task = j->task;
-	struct starpu_codelet_t *cl = task->cl;
+	struct starpu_codelet *cl = task->cl;
 
 	STARPU_ASSERT(cl);
-	STARPU_ASSERT(cl->cpu_func);
 
 	if (rank == 0)
 	{
-		ret = _starpu_fetch_task_input(task, 0);
+		ret = _starpu_fetch_task_input(j, 0);
 		if (ret != 0)
 		{
 			/* there was not enough memory so the codelet cannot be executed right now ... */
@@ -52,7 +49,7 @@ static int execute_job_on_cpu(starpu_job_t j, struct starpu_worker_s *cpu_args,
 	}
 
 	if (is_parallel_task)
-		PTHREAD_BARRIER_WAIT(&j->before_work_barrier);
+		_STARPU_PTHREAD_BARRIER_WAIT(&j->before_work_barrier);
 
 	_starpu_driver_start_job(cpu_args, j, &codelet_start, rank);
 
@@ -60,31 +57,27 @@ static int execute_job_on_cpu(starpu_job_t j, struct starpu_worker_s *cpu_args,
 	 * execute the kernel at all. */
 	if ((rank == 0) || (cl->type != STARPU_FORKJOIN))
 	{
-		if (cl->cpu_func != STARPU_MULTIPLE_CPU_IMPLEMENTATIONS) {
-			cl_func func = cl->cpu_func;
-			STARPU_ASSERT(func);
-			func(task->interfaces, task->cl_arg);
-		}
-		else {
-			if (cl->cpu_funcs[j->nimpl] != NULL) {
-				/* _STARPU_DEBUG("CPU driver : running kernel (%d)\n", j->nimpl); */
-				cl_func func = cl->cpu_funcs[j->nimpl];
-				STARPU_ASSERT(func);
-				func(task->interfaces, task->cl_arg);
-			}
-		}
+		_starpu_cl_func_t func = _starpu_task_get_cpu_nth_implementation(cl, j->nimpl);
+		if (is_parallel_task && cl->type == STARPU_FORKJOIN)
+			/* bind to parallel worker */
+			_starpu_bind_thread_on_cpus(cpu_args->config, _starpu_get_combined_worker_struct(j->combined_workerid));
+		STARPU_ASSERT(func);
+		func(task->interfaces, task->cl_arg);
+		if (is_parallel_task && cl->type == STARPU_FORKJOIN)
+			/* rebind to single CPU */
+			_starpu_bind_thread_on_cpu(cpu_args->config, cpu_args->bindid);
 	}
 
-	_starpu_driver_end_job(cpu_args, j, &codelet_end, rank);
+	_starpu_driver_end_job(cpu_args, j, perf_arch, &codelet_end, rank);
 
 	if (is_parallel_task)
-		PTHREAD_BARRIER_WAIT(&j->after_work_barrier);
+		_STARPU_PTHREAD_BARRIER_WAIT(&j->after_work_barrier);
 
 	if (rank == 0)
 	{
 		_starpu_driver_update_job_feedback(j, cpu_args,
 				perf_arch, &codelet_start, &codelet_end);
-		_starpu_push_task_output(task, 0);
+		_starpu_push_task_output(j, 0);
 	}
 
 	return 0;
@@ -92,7 +85,7 @@ static int execute_job_on_cpu(starpu_job_t j, struct starpu_worker_s *cpu_args,
 
 void *_starpu_cpu_worker(void *arg)
 {
-	struct starpu_worker_s *cpu_arg = (struct starpu_worker_s *) arg;
+	struct _starpu_worker *cpu_arg = (struct _starpu_worker *) arg;
 	unsigned memnode = cpu_arg->memory_node;
 	int workerid = cpu_arg->workerid;
 	int devid = cpu_arg->devid;
@@ -100,7 +93,7 @@ void *_starpu_cpu_worker(void *arg)
 #ifdef STARPU_USE_FXT
 	_starpu_fxt_register_thread(cpu_arg->bindid);
 #endif
-	STARPU_TRACE_WORKER_INIT_START(STARPU_FUT_CPU_KEY, devid, memnode);
+	_STARPU_TRACE_WORKER_INIT_START(_STARPU_FUT_CPU_KEY, devid, memnode);
 
 	_starpu_bind_thread_on_cpu(cpu_arg->config, cpu_arg->bindid);
 
@@ -115,15 +108,15 @@ void *_starpu_cpu_worker(void *arg)
 
 	cpu_arg->status = STATUS_UNKNOWN;
 
-	STARPU_TRACE_WORKER_INIT_END
+	_STARPU_TRACE_WORKER_INIT_END
 
         /* tell the main thread that we are ready */
-	PTHREAD_MUTEX_LOCK(&cpu_arg->mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&cpu_arg->mutex);
 	cpu_arg->worker_is_initialized = 1;
-	PTHREAD_COND_SIGNAL(&cpu_arg->ready_cond);
-	PTHREAD_MUTEX_UNLOCK(&cpu_arg->mutex);
+	_STARPU_PTHREAD_COND_SIGNAL(&cpu_arg->ready_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&cpu_arg->mutex);
 
-        starpu_job_t j;
+        struct _starpu_job *j;
 	struct starpu_task *task;
 
 	int res;
@@ -133,42 +126,41 @@ void *_starpu_cpu_worker(void *arg)
 
 	while (_starpu_machine_is_running())
 	{
-		STARPU_TRACE_START_PROGRESS(memnode);
+		_STARPU_TRACE_START_PROGRESS(memnode);
 		_starpu_datawizard_progress(memnode, 1);
-		STARPU_TRACE_END_PROGRESS(memnode);
+		_STARPU_TRACE_END_PROGRESS(memnode);
 
 		/* take the mutex inside pop because it depends what mutex:
 		   the one of the local task or the one of one of the strategies */
 		task = _starpu_pop_task(cpu_arg);
 
-                if (!task) 
+                if (!task)
 		{
-			PTHREAD_MUTEX_LOCK(sched_mutex);
-			if (_starpu_worker_can_block(memnode)){
+			_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
+			if (_starpu_worker_can_block(memnode))
 				_starpu_block_worker(workerid, sched_cond, sched_mutex);
-			}
 
-			PTHREAD_MUTEX_UNLOCK(sched_mutex);
+			_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
 			continue;
 		};
 
-		STARPU_ASSERT(task);
 
+		STARPU_ASSERT(task);
 		j = _starpu_get_job_associated_to_task(task);
-	
+
 		/* can a cpu perform that task ? */
-		if (!STARPU_CPU_MAY_PERFORM(j)) 
+		if (!_STARPU_CPU_MAY_PERFORM(j))
 		{
 			/* put it at the end of the queue ... XXX */
-			_starpu_push_task(j, 0);
+			_starpu_push_task(j);
 			continue;
 		}
 
 		int rank = 0;
 		int is_parallel_task = (j->task_size > 1);
 
-		enum starpu_perf_archtype perf_arch; 
-	
+		enum starpu_perf_archtype perf_arch;
+
 		/* Get the rank in case it is a parallel task */
 		if (is_parallel_task)
 		{
@@ -176,11 +168,11 @@ void *_starpu_cpu_worker(void *arg)
 			STARPU_ASSERT(task != j->task);
 			free(task);
 
-			PTHREAD_MUTEX_LOCK(&j->sync_mutex);
+			_STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 			rank = j->active_task_alias_count++;
-			PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
+			_STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
 
-			struct starpu_combined_worker_s *combined_worker;
+			struct _starpu_combined_worker *combined_worker;
 			combined_worker = _starpu_get_combined_worker_struct(j->combined_workerid);
 
 			cpu_arg->combined_workerid = j->combined_workerid;
@@ -188,7 +180,8 @@ void *_starpu_cpu_worker(void *arg)
 			cpu_arg->current_rank = rank;
 			perf_arch = combined_worker->perf_arch;
 		}
-		else {
+		else
+		{
 			cpu_arg->combined_workerid = cpu_arg->workerid;
 			cpu_arg->worker_size = 1;
 			cpu_arg->current_rank = 0;
@@ -196,34 +189,39 @@ void *_starpu_cpu_worker(void *arg)
 		}
 
 		_starpu_set_current_task(j->task);
+		cpu_arg->current_task = j->task;
 
-		res = execute_job_on_cpu(j, cpu_arg, is_parallel_task, rank, perf_arch);
+                res = execute_job_on_cpu(j, cpu_arg, is_parallel_task, rank, perf_arch);
 
 		_starpu_set_current_task(NULL);
+		cpu_arg->current_task = NULL;
 
-		if (res) {
-			switch (res) {
+		if (res)
+		{
+			switch (res)
+			{
 				case -EAGAIN:
-					_starpu_push_task(j, 0);
+					_starpu_push_task(j);
 					continue;
-				default: 
-					assert(0);
+				default:
+					STARPU_ASSERT(0);
 			}
 		}
 
-		if (rank == 0){
-			_starpu_handle_job_termination(j, 0, workerid);
-		}
+		if (rank == 0)
+			_starpu_handle_job_termination(j, workerid);
         }
 
-	STARPU_TRACE_WORKER_DEINIT_START
+	_STARPU_TRACE_WORKER_DEINIT_START
+
+	_starpu_handle_all_pending_node_data_requests(memnode);
 
 	/* In case there remains some memory that was automatically
 	 * allocated by StarPU, we release it now. Note that data
 	 * coherency is not maintained anymore at that point ! */
 	_starpu_free_all_automatically_allocated_buffers(memnode);
 
-	STARPU_TRACE_WORKER_DEINIT_END(STARPU_FUT_CPU_KEY);
+	_STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_CPU_KEY);
 
 	pthread_exit(NULL);
 	return NULL;
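
The CPU driver now always resolves its kernel through _starpu_task_get_cpu_nth_implementation(), i.e. implementations are listed in the codelet's cpu_funcs[] array instead of a single cpu_func field. A sketch of the matching user-side declaration, not part of the patch (the kernel body is hypothetical, and .where is taken from the public header rather than from this diff):

    #include <starpu.h>

    /* Hypothetical kernel: the layout of buffers[] depends on the
     * registered data interface. */
    static void scal_cpu(void *buffers[], void *cl_arg)
    {
        (void) buffers;
        (void) cl_arg;
    }

    static struct starpu_codelet scal_cl =
    {
        .where = STARPU_CPU,
        .cpu_funcs = { scal_cpu, NULL },  /* further implementations may follow */
        .nbuffers = 1,
        .modes = { STARPU_RW },
    };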

+ 98 - 67
src/drivers/cuda/driver_cuda.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2011-2012  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
@@ -33,6 +33,7 @@ static int ncudagpus;
 
 static cudaStream_t streams[STARPU_NMAXWORKERS];
 static cudaStream_t transfer_streams[STARPU_NMAXWORKERS];
+static struct cudaDeviceProp props[STARPU_MAXCUDADEVS];
 
 /* In case we want to cap the amount of memory available on the GPUs by the
  * mean of the STARPU_LIMIT_GPU_MEM, we allocate a big buffer when the driver
@@ -51,20 +52,17 @@ static void limit_gpu_mem_if_needed(int devid)
 	}
 
 	/* Find the size of the memory on the device */
-	struct cudaDeviceProp prop;
-	cures = cudaGetDeviceProperties(&prop, devid);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-
-	size_t totalGlobalMem = prop.totalGlobalMem;
+	size_t totalGlobalMem = props[devid].totalGlobalMem;
 
 	/* How much memory to waste ? */
 	size_t to_waste = totalGlobalMem - (size_t)limit*1024*1024;
 
+	props[devid].totalGlobalMem -= to_waste;
+
 	_STARPU_DEBUG("CUDA device %d: Wasting %ld MB / Limit %ld MB / Total %ld MB / Remains %ld MB\n",
 			devid, (size_t)to_waste/(1024*1024), (size_t)limit, (size_t)totalGlobalMem/(1024*1024),
 			(size_t)(totalGlobalMem - to_waste)/(1024*1024));
-	
+
 	/* Allocate a large buffer to waste memory and constrain the amount of available memory. */
 	cures = cudaMalloc((void **)&wasted_memory[devid], to_waste);
 	if (STARPU_UNLIKELY(cures))
@@ -85,6 +83,11 @@ static void unlimit_gpu_mem_if_needed(int devid)
 	}
 }
 
+size_t starpu_cuda_get_global_mem_size(int devid)
+{
+	return (size_t)props[devid].totalGlobalMem;
+}
+
 cudaStream_t starpu_cuda_get_local_transfer_stream(void)
 {
 	int worker = starpu_worker_get_id();
@@ -99,6 +102,13 @@ cudaStream_t starpu_cuda_get_local_stream(void)
 	return streams[worker];
 }
 
+const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid)
+{
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+	unsigned devid = config->workers[workerid].devid;
+	return &props[devid];
+}
+
 static void init_context(int devid)
 {
 	cudaError_t cures;
@@ -109,7 +119,24 @@ static void init_context(int devid)
 		STARPU_CUDA_REPORT_ERROR(cures);
 
 	/* force CUDA to initialize the context for real */
-	cudaFree(0);
+	cures = cudaFree(0);
+	if (STARPU_UNLIKELY(cures)) {
+		if (cures == cudaErrorDevicesUnavailable) {
+			fprintf(stderr,"All CUDA-capable devices are busy or unavailable\n");
+			exit(77);
+		}
+		STARPU_CUDA_REPORT_ERROR(cures);
+	}
+
+	cures = cudaGetDeviceProperties(&props[devid], devid);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+#ifdef HAVE_CUDA_MEMCPY_PEER
+	if (props[devid].computeMode == cudaComputeModeExclusive) {
+		fprintf(stderr, "CUDA is in EXCLUSIVE-THREAD mode, but StarPU was built with multithread GPU control support, please either ask your administrator to use EXCLUSIVE-PROCESS mode (which should really be fine), or reconfigure with --disable-cuda-memcpy-peer but that will disable the memcpy-peer optimizations\n");
+		STARPU_ASSERT(0);
+	}
+#endif
 
 	limit_gpu_mem_if_needed(devid);
 
@@ -146,7 +173,8 @@ unsigned _starpu_get_cuda_device_count(void)
 	if (STARPU_UNLIKELY(cures))
 		 return 0;
 
-	if (cnt > STARPU_MAXCUDADEVS) {
+	if (cnt > STARPU_MAXCUDADEVS)
+	{
 		fprintf(stderr, "# Warning: %d CUDA devices available. Only %d enabled. Use configure option --enable-maxcudadev=xxx to update the maximum value of supported CUDA devices.\n", cnt, STARPU_MAXCUDADEVS);
 		cnt = STARPU_MAXCUDADEVS;
 	}
@@ -156,10 +184,10 @@ unsigned _starpu_get_cuda_device_count(void)
 void _starpu_init_cuda(void)
 {
 	ncudagpus = _starpu_get_cuda_device_count();
-	assert(ncudagpus <= STARPU_MAXCUDADEVS);
+	STARPU_ASSERT(ncudagpus <= STARPU_MAXCUDADEVS);
 }
 
-static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
+static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *args)
 {
 	int ret;
 	uint32_t mask = 0;
@@ -173,16 +201,17 @@ static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
 	unsigned calibrate_model = 0;
 
 	STARPU_ASSERT(task);
-	struct starpu_codelet_t *cl = task->cl;
+	struct starpu_codelet *cl = task->cl;
 	STARPU_ASSERT(cl);
 
-	if (cl->model && cl->model->benchmarking) 
+	if (cl->model && cl->model->benchmarking)
 		calibrate_model = 1;
 
-	ret = _starpu_fetch_task_input(task, mask);
-	if (ret != 0) {
+	ret = _starpu_fetch_task_input(j, mask);
+	if (ret != 0)
+	{
 		/* there was not enough memory, so the input of
-		 * the codelet cannot be fetched ... put the 
+		 * the codelet cannot be fetched ... put the
 		 * codelet back, and try it later */
 		return -EAGAIN;
 	}
@@ -199,34 +228,26 @@ static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
 #ifdef HAVE_CUDA_MEMCPY_PEER
 	/* We make sure we do manipulate the proper device */
 	cures = cudaSetDevice(args->devid);
+	if (STARPU_UNLIKELY(cures != cudaSuccess))
+		STARPU_CUDA_REPORT_ERROR(cures);
 #endif
 
-	if (cl->cuda_func != STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS) {
-		cl_func func = cl->cuda_func;
-		STARPU_ASSERT(func);
-		func(task->interfaces, task->cl_arg);
-	}
-	else {
-		if (cl->cuda_funcs[j->nimpl] != NULL) {
-			/* _STARPU_DEBUG("Cuda driver : running kernel * (%d)\n", j->nimpl); */
-			cl_func func = cl->cuda_funcs[j->nimpl];
-			STARPU_ASSERT(func);
-			func(task->interfaces, task->cl_arg);
-		}
-	}
+	starpu_cuda_func_t func = _starpu_task_get_cuda_nth_implementation(cl, j->nimpl);
+	STARPU_ASSERT(func);
+	func(task->interfaces, task->cl_arg);
 
-	_starpu_driver_end_job(args, j, &codelet_end, 0);
+	_starpu_driver_end_job(args, j, args->perf_arch, &codelet_end, 0);
 
 	_starpu_driver_update_job_feedback(j, args, args->perf_arch, &codelet_start, &codelet_end);
 
-	_starpu_push_task_output(task, mask);
+	_starpu_push_task_output(j, mask);
 
 	return 0;
 }
 
 void *_starpu_cuda_worker(void *arg)
 {
-	struct starpu_worker_s* args = arg;
+	struct _starpu_worker* args = arg;
 
 	int devid = args->devid;
 	int workerid = args->workerid;
@@ -235,7 +256,7 @@ void *_starpu_cuda_worker(void *arg)
 #ifdef STARPU_USE_FXT
 	_starpu_fxt_register_thread(args->bindid);
 #endif
-	STARPU_TRACE_WORKER_INIT_START(STARPU_FUT_CUDA_KEY, devid, memnode);
+	_STARPU_TRACE_WORKER_INIT_START(_STARPU_FUT_CUDA_KEY, devid, memnode);
 
 	_starpu_bind_thread_on_cpu(args->config, args->bindid);
 
@@ -252,26 +273,31 @@ void *_starpu_cuda_worker(void *arg)
 
 	/* get the device's name */
 	char devname[128];
-	struct cudaDeviceProp prop;
-	cudaGetDeviceProperties(&prop, devid);
-	strncpy(devname, prop.name, 128);
-#if CUDA_VERSION >= 3020
-	snprintf(args->name, sizeof(args->name), "CUDA %d (%s %02x:%02x.0)", args->devid, devname, prop.pciBusID, prop.pciDeviceID);
+	strncpy(devname, props[devid].name, 128);
+	float size = (float) props[devid].totalGlobalMem / (1<<30);
+
+#ifdef STARPU_HAVE_BUSID
+#ifdef STARPU_HAVE_DOMAINID
+	if (props[devid].pciDomainID)
+		snprintf(args->name, sizeof(args->name), "CUDA %d (%s %.1f GiB %04x:%02x:%02x.0)", args->devid, devname, size, props[devid].pciDomainID, props[devid].pciBusID, props[devid].pciDeviceID);
+	else
+#endif
+		snprintf(args->name, sizeof(args->name), "CUDA %d (%s %.1f GiB %02x:%02x.0)", args->devid, devname, size, props[devid].pciBusID, props[devid].pciDeviceID);
 #else
-	snprintf(args->name, sizeof(args->name), "CUDA %d (%s)", args->devid, devname);
+	snprintf(args->name, sizeof(args->name), "CUDA %d (%s %.1f GiB)", args->devid, devname, size);
 #endif
 	snprintf(args->short_name, sizeof(args->short_name), "CUDA %d", args->devid);
 	_STARPU_DEBUG("cuda (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, args->bindid);
 
-	STARPU_TRACE_WORKER_INIT_END
+	_STARPU_TRACE_WORKER_INIT_END
 
 	/* tell the main thread that this one is ready */
-	PTHREAD_MUTEX_LOCK(&args->mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&args->mutex);
 	args->worker_is_initialized = 1;
-	PTHREAD_COND_SIGNAL(&args->ready_cond);
-	PTHREAD_MUTEX_UNLOCK(&args->mutex);
+	_STARPU_PTHREAD_COND_SIGNAL(&args->ready_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&args->mutex);
 
-	struct starpu_job_s * j;
+	struct _starpu_job * j;
 	struct starpu_task *task;
 	int res;
 
@@ -280,60 +306,64 @@ void *_starpu_cuda_worker(void *arg)
 
 	while (_starpu_machine_is_running())
 	{
-		STARPU_TRACE_START_PROGRESS(memnode);
+		_STARPU_TRACE_START_PROGRESS(memnode);
 		_starpu_datawizard_progress(memnode, 1);
-		STARPU_TRACE_END_PROGRESS(memnode);
+		_STARPU_TRACE_END_PROGRESS(memnode);
 
 		task = _starpu_pop_task(args);
 
 		if (!task) 
 		{
-			PTHREAD_MUTEX_LOCK(sched_mutex);
+			_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
 			if (_starpu_worker_can_block(memnode))
 				_starpu_block_worker(workerid, sched_cond, sched_mutex);
 		  
 
-			PTHREAD_MUTEX_UNLOCK(sched_mutex);
+			_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
 
 			continue;
 		};
 
 
 		STARPU_ASSERT(task);
-
 		j = _starpu_get_job_associated_to_task(task);
 
 		/* can CUDA do that task ? */
-		if (!STARPU_CUDA_MAY_PERFORM(j))
+		if (!_STARPU_CUDA_MAY_PERFORM(j))
 		{
 			/* this is neither a cuda nor a cublas task */
-			_starpu_push_task(j, 0);
+			_starpu_push_task(j);
 			continue;
 		}
 
 		_starpu_set_current_task(task);
+		args->current_task = j->task;
 
 		res = execute_job_on_cuda(j, args);
 
-
 		_starpu_set_current_task(NULL);
+		args->current_task = NULL;
 
-		if (res) {
-			switch (res) {
+		if (res)
+		{
+			switch (res)
+			{
 				case -EAGAIN:
 					_STARPU_DISP("ouch, put the codelet %p back ... \n", j);
-					_starpu_push_task(j, 0);
+					_starpu_push_task(j);
 					STARPU_ABORT();
 					continue;
 				default:
-					assert(0);
+					STARPU_ASSERT(0);
 			}
 		}
 
-		_starpu_handle_job_termination(j, 0, workerid);
+		_starpu_handle_job_termination(j, workerid);
 	}
 
-	STARPU_TRACE_WORKER_DEINIT_START
+	_STARPU_TRACE_WORKER_DEINIT_START
+
+	_starpu_handle_all_pending_node_data_requests(memnode);
 
 	/* In case there remains some memory that was automatically
 	 * allocated by StarPU, we release it now. Note that data
@@ -342,7 +372,7 @@ void *_starpu_cuda_worker(void *arg)
 
 	deinit_context(args->workerid, args->devid);
 
-	STARPU_TRACE_WORKER_DEINIT_END(STARPU_FUT_CUDA_KEY);
+	_STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_CUDA_KEY);
 
 	pthread_exit(NULL);
 
@@ -350,10 +380,11 @@ void *_starpu_cuda_worker(void *arg)
 
 }
 
-void starpu_cublas_report_error(const char *func, cublasStatus status)
+void starpu_cublas_report_error(const char *func, const char *file, int line, cublasStatus status)
 {
 	char *errormsg;
-	switch (status) {
+	switch (status)
+	{
 		case CUBLAS_STATUS_SUCCESS:
 			errormsg = "success";
 			break;
@@ -379,13 +410,13 @@ void starpu_cublas_report_error(const char *func, cublasStatus status)
 			errormsg = "unknown error";
 			break;
 	}
-	printf("oops in %s ... %s \n", func, errormsg);
-	assert(0);
+	printf("oops in %s (%s:%u)... %d: %s \n", func, file, line, status, errormsg);
+	STARPU_ASSERT(0);
 }
 
-void starpu_cuda_report_error(const char *func, cudaError_t status)
+void starpu_cuda_report_error(const char *func, const char *file, int line, cudaError_t status)
 {
 	const char *errormsg = cudaGetErrorString(status);
-	printf("oops in %s ... %s \n", func, errormsg);
-	assert(0);
+	printf("oops in %s (%s:%u)... %d: %s \n", func, file, line, status, errormsg);
+	STARPU_ASSERT(0);
 }
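
Besides the worker-loop changes, this file now caches each device's cudaDeviceProp and exposes it through starpu_cuda_get_device_properties(), next to starpu_cuda_get_local_stream() and starpu_cuda_get_global_mem_size(). A hedged sketch of a CUDA codelet using them, not part of the patch (my_kernel is made up, so the launch is only indicated in a comment):

    #include <starpu.h>
    #include <starpu_cuda.h>
    #include <cuda_runtime.h>

    static void scal_cuda(void *buffers[], void *cl_arg)
    {
        (void) buffers;
        (void) cl_arg;

        const struct cudaDeviceProp *prop =
            starpu_cuda_get_device_properties(starpu_worker_get_id());

        /* e.g. pick a block size the device supports */
        unsigned threads = prop->maxThreadsPerBlock < 256 ? prop->maxThreadsPerBlock : 256;
        (void) threads;

        /* my_kernel<<<1, threads, 0, starpu_cuda_get_local_stream()>>>(...); */
        cudaStreamSynchronize(starpu_cuda_get_local_stream());
    }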

+ 35 - 28
src/drivers/driver_common/driver_common.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -24,68 +24,74 @@
 #include <core/debug.h>
 #include <drivers/driver_common/driver_common.h>
 #include <starpu_top.h>
+#include <core/sched_policy.h>
+#include <top/starpu_top_core.h>
 
-void _starpu_driver_start_job(struct starpu_worker_s *args, starpu_job_t j, struct timespec *codelet_start, int rank)
+void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j, struct timespec *codelet_start, int rank)
 {
 	struct starpu_task *task = j->task;
-	struct starpu_codelet_t *cl = task->cl;
+	struct starpu_codelet *cl = task->cl;
 	struct starpu_task_profiling_info *profiling_info;
 	int profiling = starpu_profiling_status_get();
-	int starpu_top=starpu_top_status_get();
+	int starpu_top=_starpu_top_status_get();
 	int workerid = args->workerid;
 	unsigned calibrate_model = 0;
 
 	if (cl->model && cl->model->benchmarking)
 		calibrate_model = 1;
 
+	if (rank == 0)
+		_starpu_sched_pre_exec_hook(task);
+
 	args->status = STATUS_EXECUTING;
-	task->status = STARPU_TASK_RUNNING;	
+	task->status = STARPU_TASK_RUNNING;
 
-	if (rank == 0) {
+	if (rank == 0)
+	{
 		cl->per_worker_stats[workerid]++;
 
 		profiling_info = task->profiling_info;
-	
+
 		if ((profiling && profiling_info) || calibrate_model || starpu_top)
 		{
-			starpu_clock_gettime(codelet_start);
+			_starpu_clock_gettime(codelet_start);
 			_starpu_worker_register_executing_start_date(workerid, codelet_start);
 		}
 	}
 
 	if (starpu_top)
-		starputop_task_started(task,workerid,codelet_start);
+		_starpu_top_task_started(task,workerid,codelet_start);
 
-	STARPU_TRACE_START_CODELET_BODY(j);
+	_STARPU_TRACE_START_CODELET_BODY(j);
 }
 
-void _starpu_driver_end_job(struct starpu_worker_s *args, starpu_job_t j, struct timespec *codelet_end, int rank)
+void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, enum starpu_perf_archtype perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank)
 {
 	struct starpu_task *task = j->task;
-	struct starpu_codelet_t *cl = task->cl;
+	struct starpu_codelet *cl = task->cl;
 	struct starpu_task_profiling_info *profiling_info = task->profiling_info;
 	int profiling = starpu_profiling_status_get();
-	int starpu_top=starpu_top_status_get();
+	int starpu_top=_starpu_top_status_get();
 	int workerid = args->workerid;
 	unsigned calibrate_model = 0;
-	enum starpu_perf_archtype archtype STARPU_ATTRIBUTE_UNUSED = args->perf_arch;
 
-	STARPU_TRACE_END_CODELET_BODY(j, archtype);
+	_STARPU_TRACE_END_CODELET_BODY(j, j->nimpl, perf_arch);
 
 	if (cl->model && cl->model->benchmarking)
 		calibrate_model = 1;
 
-	if (rank == 0) {
+	if (rank == 0)
+	{
 		if ((profiling && profiling_info) || calibrate_model || starpu_top)
-			starpu_clock_gettime(codelet_end);
+			_starpu_clock_gettime(codelet_end);
 	}
 
 	if (starpu_top)
-	  starputop_task_ended(task,workerid,codelet_end);
+	  _starpu_top_task_ended(task,workerid,codelet_end);
 
 	args->status = STATUS_UNKNOWN;
 }
-void _starpu_driver_update_job_feedback(starpu_job_t j, struct starpu_worker_s *worker_args,
+void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_worker *worker_args,
 					enum starpu_perf_archtype perf_arch,
 					struct timespec *codelet_start, struct timespec *codelet_end)
 {
@@ -93,7 +99,7 @@ void _starpu_driver_update_job_feedback(starpu_job_t j, struct starpu_worker_s *
 	struct timespec measured_ts;
 	double measured;
 	int workerid = worker_args->workerid;
-	struct starpu_codelet_t *cl = j->task->cl;
+	struct starpu_codelet *cl = j->task->cl;
 	int calibrate_model = 0;
 	int profiling = starpu_profiling_status_get();
 	int updated = 0;
@@ -112,7 +118,7 @@ void _starpu_driver_update_job_feedback(starpu_job_t j, struct starpu_worker_s *
 			memcpy(&profiling_info->end_time, codelet_end, sizeof(struct timespec));
 
 			profiling_info->workerid = workerid;
-			
+
 			_starpu_worker_update_profiling_info_executing(workerid, &measured_ts, 1,
 				profiling_info->used_cycles,
 				profiling_info->stall_cycles,
@@ -130,9 +136,10 @@ void _starpu_driver_update_job_feedback(starpu_job_t j, struct starpu_worker_s *
 	if (!updated)
 		_starpu_worker_update_profiling_info_executing(workerid, NULL, 1, 0, 0, 0);
 
-	if (profiling_info && profiling_info->power_consumed && cl->power_model && cl->power_model->benchmarking) {
+	if (profiling_info && profiling_info->power_consumed && cl->power_model && cl->power_model->benchmarking)
+	{
 		_starpu_update_perfmodel_history(j, j->task->cl->power_model,  perf_arch, worker_args->devid, profiling_info->power_consumed,j->nimpl);
-		}
+	}
 }
 
 /* Workers may block when there is no work to do at all. We assume that the
@@ -141,17 +148,17 @@ void _starpu_block_worker(int workerid, pthread_cond_t *cond, pthread_mutex_t *m
 {
 	struct timespec start_time, end_time;
 
-	STARPU_TRACE_WORKER_SLEEP_START
+	_STARPU_TRACE_WORKER_SLEEP_START
 	_starpu_worker_set_status(workerid, STATUS_SLEEPING);
 
-	starpu_clock_gettime(&start_time);
+	_starpu_clock_gettime(&start_time);
 	_starpu_worker_register_sleeping_start_date(workerid, &start_time);
 
-	PTHREAD_COND_WAIT(cond, mutex);
+	_STARPU_PTHREAD_COND_WAIT(cond, mutex);
 
 	_starpu_worker_set_status(workerid, STATUS_UNKNOWN);
-	STARPU_TRACE_WORKER_SLEEP_END
-	starpu_clock_gettime(&end_time);
+	_STARPU_TRACE_WORKER_SLEEP_END
+	_starpu_clock_gettime(&end_time);
 
 	int profiling = starpu_profiling_status_get();
 	if (profiling)

+ 8 - 8
src/drivers/driver_common/driver_common.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,13 +23,13 @@
 #include <core/jobs.h>
 #include <common/utils.h>
 
-void _starpu_driver_start_job(struct starpu_worker_s *args, starpu_job_t j,
-		struct timespec *codelet_start, int rank);
-void _starpu_driver_end_job(struct starpu_worker_s *args, starpu_job_t j,
-		struct timespec *codelet_end, int rank);
-void _starpu_driver_update_job_feedback(starpu_job_t j, struct starpu_worker_s *worker_args,
-		enum starpu_perf_archtype perf_arch,
-		struct timespec *codelet_start, struct timespec *codelet_end);
+void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j,
+			      struct timespec *codelet_start, int rank);
+void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, enum starpu_perf_archtype perf_arch,
+			    struct timespec *codelet_end, int rank);
+void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_worker *worker_args,
+					enum starpu_perf_archtype perf_arch,
+					struct timespec *codelet_start, struct timespec *codelet_end);
 
 void _starpu_block_worker(int workerid, pthread_cond_t *cond, pthread_mutex_t *mutex);
 

+ 107 - 90
src/drivers/gordon/driver_gordon.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009, 2010, 2011-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -34,14 +34,15 @@ pthread_t progress_thread;
 pthread_cond_t progress_cond;
 pthread_mutex_t progress_mutex;
 
-struct gordon_task_wrapper_s {
+struct gordon_task_wrapper_s
+{
 	/* who has executed that ? */
-	struct starpu_worker_s *worker;
+	struct _starpu_worker *worker;
 
-	struct starpu_job_list_s *list;	/* StarPU */
+	struct _starpu_job_list *list;	/* StarPU */
 	struct gordon_ppu_job_s *gordon_job; /* gordon*/
 
-	struct starpu_job_s *j; /* if there is a single task */
+	struct _starpu_job *j; /* if there is a single task */
 
 	/* debug */
 	unsigned terminated;
@@ -52,18 +53,19 @@ void *gordon_worker_progress(void *arg)
 	_STARPU_DEBUG("gordon_worker_progress\n");
 
 	/* fix the thread on the correct cpu */
-	struct starpu_worker_set_s *gordon_set_arg = arg;
-	unsigned prog_thread_bind_id = 
+	struct _starpu_worker_set *gordon_set_arg = arg;
+	unsigned prog_thread_bind_id =
 		(gordon_set_arg->workers[0].bindid + 1)%(gordon_set_arg->config->nhwcores);
 	_starpu_bind_thread_on_cpu(gordon_set_arg->config, prog_thread_bind_id);
 
-	PTHREAD_MUTEX_LOCK(&progress_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 	progress_thread_is_inited = 1;
-	PTHREAD_COND_SIGNAL(&progress_cond);
-	PTHREAD_MUTEX_UNLOCK(&progress_mutex);
+	_STARPU_PTHREAD_COND_SIGNAL(&progress_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 
-	while (1) {
-		/* the Gordon runtime needs to make sure that we poll it 
+	while (1)
+	{
+		/* the Gordon runtime needs to make sure that we poll it
 		 * so that we handle jobs that are done */
 
 		/* wait for one task termination */
@@ -78,21 +80,22 @@ void *gordon_worker_progress(void *arg)
 	return NULL;
 }
 
-static void starpu_to_gordon_buffers(starpu_job_t j, struct gordon_ppu_job_s *gordon_job, uint32_t memory_node)
+static void starpu_to_gordon_buffers(struct _starpu_job *j, struct gordon_ppu_job_s *gordon_job, uint32_t memory_node)
 {
 	unsigned buffer;
 	unsigned nin = 0, ninout = 0, nout = 0;
 	unsigned in = 0, inout = 0, out = 0;
 
 	struct starpu_task *task = j->task;
-	struct starpu_codelet_t *cl = task->cl;
+	struct starpu_codelet *cl = task->cl;
 
 	/* if it is non null, the argument buffer is considered
  	 * as the first read-only buffer */
-	if (task->cl_arg) {
+	if (task->cl_arg)
+	{
 		gordon_job->buffers[in] = (uint64_t)task->cl_arg;
 		gordon_job->ss[in].size = (uint32_t)task->cl_arg_size;
-		
+
 		nin++; in++;
 	}
 
@@ -100,10 +103,10 @@ static void starpu_to_gordon_buffers(starpu_job_t j, struct gordon_ppu_job_s *go
 	unsigned nbuffers = cl->nbuffers;
 	for (buffer = 0; buffer < nbuffers; buffer++)
 	{
-		struct starpu_buffer_descr_t *descr;
-		descr = &task->buffers[buffer];
+		enum starpu_access_mode mode = cl->modes[buffer];
 
-		switch (descr->mode) {
+		switch (mode)
+		{
 			case STARPU_R:
 				nin++;
 				break;
@@ -120,10 +123,10 @@ static void starpu_to_gordon_buffers(starpu_job_t j, struct gordon_ppu_job_s *go
 	for (buffer = 0; buffer < nbuffers; buffer++)
 	{
 		unsigned gordon_buffer;
-		struct starpu_buffer_descr_t *descr;
-		descr = &task->buffers[buffer];
+		enum starpu_access_mode mode = cl->modes[buffer];
 
-		switch (descr->mode) {
+		switch (mode)
+		{
 			case STARPU_R:
 				gordon_buffer = in++;
 				break;
@@ -136,7 +139,7 @@ static void starpu_to_gordon_buffers(starpu_job_t j, struct gordon_ppu_job_s *go
 				break;
 		}
 
-		starpu_data_handle handle = task->buffers[buffer].handle;
+		starpu_data_handle_t handle = task->handles[buffer];
 
 		gordon_job->nalloc = 0;
 		gordon_job->nin = nin;
@@ -150,9 +153,9 @@ static void starpu_to_gordon_buffers(starpu_job_t j, struct gordon_ppu_job_s *go
 	}
 }
 
-/* we assume the data are already available so that the data interface fields are 
+/* we assume the data are already available so that the data interface fields are
  * already filled */
-static struct gordon_task_wrapper_s *starpu_to_gordon_job(starpu_job_t j)
+static struct gordon_task_wrapper_s *starpu_to_gordon_job(struct _starpu_job *j)
 {
 	struct gordon_ppu_job_s *gordon_job = gordon_alloc_jobs(1, 0);
 	struct gordon_task_wrapper_s *task_wrapper =
@@ -162,10 +165,7 @@ static struct gordon_task_wrapper_s *starpu_to_gordon_job(starpu_job_t j)
 	task_wrapper->j = j;
 	task_wrapper->terminated = 0;
 
-	if (j->task->clgordon_func != STARPU_MULTIPLE_GORDON_IMPLEMENTATIONS)
-		gordon_job->index = j->task->cl->gordon_func;
-	else
-		gordon_job->index = j->task->cl->gordon_funcs[j->nimpl];
+	gordon_job->index = _starpu_task_get_gordon_nth_implementation(j->task->cl, j->nimpl);
 
 	/* we should not hardcode the memory node ... XXX */
 	unsigned memory_node = 0;
@@ -174,21 +174,21 @@ static struct gordon_task_wrapper_s *starpu_to_gordon_job(starpu_job_t j)
 	return task_wrapper;
 }
 
-static void handle_terminated_job(starpu_job_t j)
+static void handle_terminated_job(struct _starpu_job *j)
 {
-	_starpu_push_task_output(j->task, 0);
+	_starpu_push_task_output(j, 0);
 	_starpu_handle_job_termination(j, 0);
 	starpu_wake_all_blocked_workers();
 }
 
 static void gordon_callback_list_func(void *arg)
 {
-	struct gordon_task_wrapper_s *task_wrapper = arg; 
-	struct starpu_job_list_s *wrapper_list; 
+	struct gordon_task_wrapper_s *task_wrapper = arg;
+	struct _starpu_job_list *wrapper_list;
 
 	/* we don't know who will execute that codelet: so we actually defer the
  	 * execution of the StarPU codelet and the job termination until later */
-	struct starpu_worker_s *worker = task_wrapper->worker;
+	struct _starpu_worker *worker = task_wrapper->worker;
 	STARPU_ASSERT(worker);
 
 	wrapper_list = task_wrapper->list;
@@ -200,12 +200,12 @@ static void gordon_callback_list_func(void *arg)
 	unsigned task_cnt = 0;
 
 	/* XXX 0 was hardcoded */
-	while (!starpu_job_list_empty(wrapper_list))
+	while (!_starpu_job_list_empty(wrapper_list))
 	{
-		starpu_job_t j = starpu_job_list_pop_back(wrapper_list);
+		struct _starpu_job *j = _starpu_job_list_pop_back(wrapper_list);
 
 		struct gordon_ppu_job_s * gordon_task = &task_wrapper->gordon_job[task_cnt];
-		struct starpu_perfmodel_t *model = j->task->cl->model;
+		struct starpu_perfmodel *model = j->task->cl->model;
 		if (model && model->benchmarking)
 		{
 			double measured = (double)gordon_task->measured;
@@ -214,7 +214,7 @@ static void gordon_callback_list_func(void *arg)
 			_starpu_update_perfmodel_history(j, j->task->cl->model, STARPU_GORDON_DEFAULT, cpuid, measured);
 		}
 
-		_starpu_push_task_output(j->task, 0);
+		_starpu_push_task_output(j, 0);
 		_starpu_handle_job_termination(j, 0);
 		//starpu_wake_all_blocked_workers();
 
@@ -222,7 +222,7 @@ static void gordon_callback_list_func(void *arg)
 	}
 
 	/* the job list was allocated by the gordon driver itself */
-	starpu_job_list_delete(wrapper_list);
+	_starpu_job_list_delete(wrapper_list);
 
 	starpu_wake_all_blocked_workers();
 	free(task_wrapper->gordon_job);
@@ -232,11 +232,11 @@ static void gordon_callback_list_func(void *arg)
 
 static void gordon_callback_func(void *arg)
 {
-	struct gordon_task_wrapper_s *task_wrapper = arg; 
+	struct gordon_task_wrapper_s *task_wrapper = arg;
 
 	/* we don't know who will execute that codelet: so we actually defer the
  	 * execution of the StarPU codelet and the job termination until later */
-	struct starpu_worker_s *worker = task_wrapper->worker;
+	struct _starpu_worker *worker = task_wrapper->worker;
 	STARPU_ASSERT(worker);
 
 	task_wrapper->terminated = 1;
@@ -249,17 +249,20 @@ static void gordon_callback_func(void *arg)
 	free(task_wrapper);
 }
 
-int inject_task(starpu_job_t j, struct starpu_worker_s *worker)
+int inject_task(struct _starpu_job *j, struct _starpu_worker *worker)
 {
 	struct starpu_task *task = j->task;
-	int ret = _starpu_fetch_task_input(task, 0);
+	int ret = _starpu_fetch_task_input(j, 0);
 
-	if (ret != 0) {
+	if (ret != 0)
+	{
 		/* there was not enough memory so the codelet cannot be executed right now ... */
 		/* push the codelet back and try another one ... */
 		return STARPU_TRYAGAIN;
 	}
 
+	_starpu_sched_pre_exec_hook(task);
+
 	struct gordon_task_wrapper_s *task_wrapper = starpu_to_gordon_job(j);
 
 	task_wrapper->worker = worker;
@@ -269,31 +272,33 @@ int inject_task(starpu_job_t j, struct starpu_worker_s *worker)
 	return 0;
 }
 
-int inject_task_list(struct starpu_job_list_s *list, struct starpu_worker_s *worker)
+int inject_task_list(struct _starpu_job_list *list, struct _starpu_worker *worker)
 {
 	/* first put back all tasks that can not be performed by Gordon */
 	unsigned nvalids = 0;
 	unsigned ninvalids = 0;
-	starpu_job_t j;
+	struct _starpu_job *j;
 
 	// TODO !
-//	
-//	for (j = starpu_job_list_begin(list); j != starpu_job_list_end(list); j = starpu_job_list_next(j) )
+//
+//	for (j = _starpu_job_list_begin(list); j != _starpu_job_list_end(list); j = _starpu_job_list_next(j) )
 //	{
-//		if (!STARPU_GORDON_MAY_PERFORM(j)) {
+//		if (!_STARPU_GORDON_MAY_PERFORM(j))
+//              {
 //			// XXX TODO
 //			ninvalids++;
 //			assert(0);
 //		}
-//		else {
+//		else
+//              {
 //			nvalids++;
 //		}
 //	}
 
-	nvalids = job_list_size(list);
+	nvalids = _job_list_size(list);
 //	_STARPU_DEBUG("nvalids %d \n", nvalids);
 
-	
+
 
 	struct gordon_task_wrapper_s *task_wrapper = malloc(sizeof(struct gordon_task_wrapper_s));
 	gordon_job_t *gordon_jobs = gordon_alloc_jobs(nvalids, 0);
@@ -303,26 +308,28 @@ int inject_task_list(struct starpu_job_list_s *list, struct starpu_worker_s *wor
 	task_wrapper->j = NULL;
 	task_wrapper->terminated = 0;
 	task_wrapper->worker = worker;
-	
+
 	unsigned index;
-	for (j = starpu_job_list_begin(list), index = 0; j != starpu_job_list_end(list); j = starpu_job_list_next(j), index++)
+	for (j = _starpu_job_list_begin(list), index = 0; j != _starpu_job_list_end(list); j = _starpu_job_list_next(j), index++)
 	{
 		int ret;
 
 		struct starpu_task *task = j->task;
-		ret = _starpu_fetch_task_input(task, 0);
+		ret = _starpu_fetch_task_input(j, 0);
 		STARPU_ASSERT(!ret);
 
-		gordon_jobs[index].index = task->cl->gordon_func;
+		_starpu_sched_pre_exec_hook(task);
+
+		gordon_jobs[index].index = _starpu_task_get_gordon_nth_implementation(task->cl, j->nimpl);
 
-		struct starpu_perfmodel_t *model = j->task->cl->model;
+		struct starpu_perfmodel *model = j->task->cl->model;
 		if (model && model->benchmarking)
 			gordon_jobs[index].flags.sampling = 1;
 
 		/* we should not hardcode the memory node ... XXX */
 		unsigned memory_node = 0;
 		starpu_to_gordon_buffers(j, &gordon_jobs[index], memory_node);
-		
+
 	}
 
 	gordon_pushjob(task_wrapper->gordon_job, gordon_callback_list_func, task_wrapper);
@@ -330,27 +337,30 @@ int inject_task_list(struct starpu_job_list_s *list, struct starpu_worker_s *wor
 	return 0;
 }
 
-void *gordon_worker_inject(struct starpu_worker_set_s *arg)
+void *gordon_worker_inject(struct _starpu_worker_set *arg)
 {
 
-	while(_starpu_machine_is_running()) {
-		if (gordon_busy_enough()) {
+	while(_starpu_machine_is_running())
+	{
+		if (gordon_busy_enough())
+		{
 			/* gordon already has enough work, wait a little TODO */
 			_starpu_wait_on_sched_event();
 		}
-		else {
+		else
+		{
 #ifndef NOCHAIN
 			int ret = 0;
 #ifdef STARPU_DEVEL
 #warning we should look into the local job list here !
 #endif
 
-			struct starpu_job_list_s *list = _starpu_pop_every_task();
+			struct _starpu_job_list *list = _starpu_pop_every_task();
 			/* XXX 0 is hardcoded */
 			if (list)
 			{
 				/* partition lists */
-				unsigned size = job_list_size(list);
+				unsigned size = _starpu_job_list_size(list);
 				unsigned nchunks = (size<2*arg->nworkers)?size:(2*arg->nworkers);
 				//unsigned nchunks = (size<arg->nworkers)?size:(arg->nworkers);
 
@@ -360,20 +370,20 @@ void *gordon_worker_inject(struct starpu_worker_set_s *arg)
 				unsigned chunk;
 				for (chunk = 0; chunk < nchunks; chunk++)
 				{
-					struct starpu_job_list_s *chunk_list;
+					struct _starpu_job_list *chunk_list;
 					if (chunk != (nchunks -1))
 					{
 						/* split the list in 2 parts : list = chunk_list | tail */
-						chunk_list = starpu_job_list_new();
+						chunk_list = _starpu_job_list_new();
 
 						/* find the end */
 						chunk_list->_head = list->_head;
 
-						starpu_job_itor_t it_j = starpu_job_list_begin(list);
+						struct _starpu_job *it_j = _starpu_job_list_begin(list);
 						unsigned ind;
 						for (ind = 0; ind < chunksize; ind++)
 						{
-							it_j = starpu_job_list_next(it_j);
+							it_j = _starpu_job_list_next(it_j);
 						}
 
 						/* it_j should be the first element of the new list (tail) */
@@ -382,7 +392,8 @@ void *gordon_worker_inject(struct starpu_worker_set_s *arg)
 						list->_head = it_j;
 						it_j->_prev = NULL;
 					}
-					else {
+					else
+					{
 						/* this is the last chunk */
 						chunk_list = list;
 					}
@@ -390,26 +401,30 @@ void *gordon_worker_inject(struct starpu_worker_set_s *arg)
 					ret = inject_task_list(chunk_list, &arg->workers[0]);
 				}
 			}
-			else {
+			else
+			{
 				_starpu_wait_on_sched_event();
 			}
 #else
 			/* gordon should accept a little more work */
-			starpu_job_t j;
+			struct _starpu_job *j;
 			j =  _starpu_pop_task();
 	//		_STARPU_DEBUG("pop task %p\n", j);
-			if (j) {
-				if (STARPU_GORDON_MAY_PERFORM(j)) {
+			if (j)
+			{
+				if (_STARPU_GORDON_MAY_PERFORM(j))
+				{
 					/* inject that task */
 					/* XXX we hardcore &arg->workers[0] for now */
 					inject_task(j, &arg->workers[0]);
 				}
-				else {
-					_starpu_push_task(j, 0);
+				else
+				{
+					_starpu_push_task(j);
 				}
 			}
 #endif
-			
+
 		}
 	}
 
@@ -418,12 +433,12 @@ void *gordon_worker_inject(struct starpu_worker_set_s *arg)
 
 void *_starpu_gordon_worker(void *arg)
 {
-	struct starpu_worker_set_s *gordon_set_arg = arg;
+	struct _starpu_worker_set *gordon_set_arg = arg;
 
 	_starpu_bind_thread_on_cpu(gordon_set_arg->config, gordon_set_arg->workers[0].bindid);
 
 	/* TODO set_local_memory_node per SPU */
-	gordon_init(gordon_set_arg->nworkers);	
+	gordon_init(gordon_set_arg->nworkers);
 
 	/* NB: On SPUs, the worker_key is set to NULL since there is no point
 	 * in associating the PPU thread with a specific SPU (worker) while
@@ -434,7 +449,7 @@ void *_starpu_gordon_worker(void *arg)
 	unsigned spu;
 	for (spu = 0; spu < gordon_set_arg->nworkers; spu++)
 	{
-		struct starpu_worker_s *worker = &gordon_set_arg->workers[spu];
+		struct _starpu_worker *worker = &gordon_set_arg->workers[spu];
 		snprintf(worker->name, sizeof(worker->name), "SPU %d", worker->id);
 		snprintf(worker->short_name, sizeof(worker->short_name), "SPU %d", worker->id);
 	}
@@ -446,27 +461,29 @@ void *_starpu_gordon_worker(void *arg)
 	 */
 
 	/* launch the progression thread */
-	PTHREAD_MUTEX_INIT(&progress_mutex, NULL);
-	PTHREAD_COND_INIT(&progress_cond, NULL);
-	
+	_STARPU_PTHREAD_MUTEX_INIT(&progress_mutex, NULL);
+	_STARPU_PTHREAD_COND_INIT(&progress_cond, NULL);
+
 	pthread_create(&progress_thread, NULL, gordon_worker_progress, gordon_set_arg);
 
 	/* wait for the progression thread to be ready */
-	PTHREAD_MUTEX_LOCK(&progress_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 	while (!progress_thread_is_inited)
-		PTHREAD_COND_WAIT(&progress_cond, &progress_mutex);
-	PTHREAD_MUTEX_UNLOCK(&progress_mutex);
+		_STARPU_PTHREAD_COND_WAIT(&progress_cond, &progress_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 
 	_STARPU_DEBUG("progress thread is running ... \n");
-	
+
 	/* tell the core that gordon is ready */
-	PTHREAD_MUTEX_LOCK(&gordon_set_arg->mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&gordon_set_arg->mutex);
 	gordon_set_arg->set_is_initialized = 1;
-	PTHREAD_COND_SIGNAL(&gordon_set_arg->ready_cond);
-	PTHREAD_MUTEX_UNLOCK(&gordon_set_arg->mutex);
+	_STARPU_PTHREAD_COND_SIGNAL(&gordon_set_arg->ready_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&gordon_set_arg->mutex);
 
 	gordon_worker_inject(gordon_set_arg);
 
+	_starpu_handle_all_pending_node_data_requests(memnode);
+
 	_STARPU_DEBUG("gordon deinit...\n");
 	gordon_deinit();
 	_STARPU_DEBUG("gordon was deinited\n");

+ 143 - 97
src/drivers/opencl/driver_opencl.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  INRIA
  *
@@ -80,6 +80,15 @@ static void unlimit_gpu_mem_if_needed(int devid)
 	}
 }
 
+size_t starpu_opencl_get_global_mem_size(int devid)
+{
+	cl_ulong totalGlobalMem;
+
+	/* Request the size of the current device's memory */
+	clGetDeviceInfo(devices[devid], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(totalGlobalMem), &totalGlobalMem, NULL);
+
+	return (size_t)totalGlobalMem;
+}
 
 void starpu_opencl_get_context(int devid, cl_context *context)
 {
@@ -98,14 +107,14 @@ void starpu_opencl_get_queue(int devid, cl_command_queue *queue)
 
 void starpu_opencl_get_current_queue(cl_command_queue *queue)
 {
-	struct starpu_worker_s *worker = _starpu_get_local_worker_key();
+	struct _starpu_worker *worker = _starpu_get_local_worker_key();
 	STARPU_ASSERT(queue);
         *queue = queues[worker->devid];
 }
 
 void starpu_opencl_get_current_context(cl_context *context)
 {
-	struct starpu_worker_s *worker = _starpu_get_local_worker_key();
+	struct _starpu_worker *worker = _starpu_get_local_worker_key();
 	STARPU_ASSERT(context);
         *context = contexts[worker->devid];
 }
@@ -114,7 +123,7 @@ cl_int _starpu_opencl_init_context(int devid)
 {
 	cl_int err;
 
-	PTHREAD_MUTEX_LOCK(&big_lock);
+	_STARPU_PTHREAD_MUTEX_LOCK(&big_lock);
 
         _STARPU_DEBUG("Initialising context for dev %d\n", devid);
 
@@ -134,7 +143,7 @@ cl_int _starpu_opencl_init_context(int devid)
         transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
-	PTHREAD_MUTEX_UNLOCK(&big_lock);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&big_lock);
 
 	limit_gpu_mem_if_needed(devid);
 
@@ -145,7 +154,7 @@ cl_int _starpu_opencl_deinit_context(int devid)
 {
         cl_int err;
 
-	PTHREAD_MUTEX_LOCK(&big_lock);
+	_STARPU_PTHREAD_MUTEX_LOCK(&big_lock);
 
         _STARPU_DEBUG("De-initialising context for dev %d\n", devid);
 
@@ -162,136 +171,168 @@ cl_int _starpu_opencl_deinit_context(int devid)
 
         contexts[devid] = NULL;
 
-	PTHREAD_MUTEX_UNLOCK(&big_lock);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&big_lock);
 
         return CL_SUCCESS;
 }
 
-cl_int _starpu_opencl_allocate_memory(void **addr, size_t size, cl_mem_flags flags)
+cl_int starpu_opencl_allocate_memory(cl_mem *mem, size_t size, cl_mem_flags flags)
 {
 	cl_int err;
-        cl_mem address;
-        struct starpu_worker_s *worker = _starpu_get_local_worker_key();
+        cl_mem memory;
+        struct _starpu_worker *worker = _starpu_get_local_worker_key();
 
-	address = clCreateBuffer(contexts[worker->devid], flags, size, NULL, &err);
+	memory = clCreateBuffer(contexts[worker->devid], flags, size, NULL, &err);
 	if (err == CL_OUT_OF_HOST_MEMORY) return err;
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
-        *addr = address;
+        *mem = memory;
         return CL_SUCCESS;
 }
 
-cl_int _starpu_opencl_copy_ram_to_opencl_async_sync(void *ptr, cl_mem buffer, size_t size, size_t offset, cl_event *event, int *ret)
+cl_int starpu_opencl_copy_ram_to_opencl_async_sync(void *ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, cl_mem buffer, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event, int *ret)
 {
         cl_int err;
-        struct starpu_worker_s *worker = _starpu_get_local_worker_key();
+        struct _starpu_worker *worker = _starpu_get_local_worker_key();
         cl_bool blocking;
 
         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
+
+        if (event)
+                _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
         err = clEnqueueWriteBuffer(transfer_queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
-        if (STARPU_LIKELY(err == CL_SUCCESS)) {
+        if (event)
+                _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+        if (STARPU_LIKELY(err == CL_SUCCESS))
+	{
                 *ret = (event == NULL) ? 0 : -EAGAIN;
                 return CL_SUCCESS;
         }
-        else {
-                if (event != NULL) {
+        else
+	{
+                if (event != NULL)
+		{
                         /* The asynchronous copy has failed, try to copy synchronously */
                         err = clEnqueueWriteBuffer(transfer_queues[worker->devid], buffer, CL_TRUE, offset, size, ptr, 0, NULL, NULL);
                 }
-                if (STARPU_LIKELY(err == CL_SUCCESS)) {
+                if (STARPU_LIKELY(err == CL_SUCCESS))
+		{
                         *ret = 0;
                         return CL_SUCCESS;
                 }
-                else {
+                else
+		{
                         STARPU_OPENCL_REPORT_ERROR(err);
                         return err;
                 }
         }
 }
 
-cl_int _starpu_opencl_copy_ram_to_opencl(void *ptr, cl_mem buffer, size_t size, size_t offset, cl_event *event)
+cl_int starpu_opencl_copy_ram_to_opencl(void *ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, cl_mem buffer, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event)
 {
         cl_int err;
-        struct starpu_worker_s *worker = _starpu_get_local_worker_key();
+        struct _starpu_worker *worker = _starpu_get_local_worker_key();
         cl_bool blocking;
 
         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
+        if (event)
+                _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
         err = clEnqueueWriteBuffer(transfer_queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
+        if (event)
+                _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
         return CL_SUCCESS;
 }
 
-cl_int _starpu_opencl_copy_opencl_to_ram_async_sync(cl_mem buffer, void *ptr, size_t size, size_t offset, cl_event *event, int *ret)
+cl_int starpu_opencl_copy_opencl_to_ram_async_sync(cl_mem buffer, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event, int *ret)
 {
         cl_int err;
-        struct starpu_worker_s *worker = _starpu_get_local_worker_key();
+        struct _starpu_worker *worker = _starpu_get_local_worker_key();
         cl_bool blocking;
 
         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
+        if (event)
+                _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
         err = clEnqueueReadBuffer(transfer_queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
-        if (STARPU_LIKELY(err == CL_SUCCESS)) {
+        if (event)
+                _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+        if (STARPU_LIKELY(err == CL_SUCCESS))
+	{
                 *ret = (event == NULL) ? 0 : -EAGAIN;
                 return CL_SUCCESS;
         }
-        else {
+        else
+	{
                 if (event != NULL)
                         /* The asynchronous copy has failed, try to copy synchronously */
                         err = clEnqueueReadBuffer(transfer_queues[worker->devid], buffer, CL_TRUE, offset, size, ptr, 0, NULL, NULL);
-                if (STARPU_LIKELY(err == CL_SUCCESS)) {
+                if (STARPU_LIKELY(err == CL_SUCCESS))
+		{
                         *ret = 0;
                         return CL_SUCCESS;
                 }
-                else {
+                else
+		{
                         STARPU_OPENCL_REPORT_ERROR(err);
                         return err;
                 }
         }
-
-        return CL_SUCCESS;
 }
 
-cl_int _starpu_opencl_copy_opencl_to_ram(cl_mem buffer, void *ptr, size_t size, size_t offset, cl_event *event)
+cl_int starpu_opencl_copy_opencl_to_ram(cl_mem buffer, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event)
 {
         cl_int err;
-        struct starpu_worker_s *worker = _starpu_get_local_worker_key();
+        struct _starpu_worker *worker = _starpu_get_local_worker_key();
         cl_bool blocking;
 
         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
+        if (event)
+                _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
         err = clEnqueueReadBuffer(transfer_queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
+        if (event)
+                _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
         return CL_SUCCESS;
 }
 
 #if 0
-cl_int _starpu_opencl_copy_rect_opencl_to_ram(cl_mem buffer, void *ptr, const size_t buffer_origin[3], const size_t host_origin[3],
+cl_int _starpu_opencl_copy_rect_opencl_to_ram(cl_mem buffer, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, const size_t buffer_origin[3], const size_t host_origin[3],
                                               const size_t region[3], size_t buffer_row_pitch, size_t buffer_slice_pitch,
                                               size_t host_row_pitch, size_t host_slice_pitch, cl_event *event)
 {
         cl_int err;
-        struct starpu_worker_s *worker = _starpu_get_local_worker_key();
+        struct _starpu_worker *worker = _starpu_get_local_worker_key();
         cl_bool blocking;
 
         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
+        if (event)
+                _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
         err = clEnqueueReadBufferRect(transfer_queues[worker->devid], buffer, blocking, buffer_origin, host_origin, region, buffer_row_pitch,
                                       buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, 0, NULL, event);
+        if (event)
+                _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
         return CL_SUCCESS;
 }
 
-cl_int _starpu_opencl_copy_rect_ram_to_opencl(void *ptr, cl_mem buffer, const size_t buffer_origin[3], const size_t host_origin[3],
+cl_int _starpu_opencl_copy_rect_ram_to_opencl(void *ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, cl_mem buffer, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, const size_t buffer_origin[3], const size_t host_origin[3],
                                               const size_t region[3], size_t buffer_row_pitch, size_t buffer_slice_pitch,
                                               size_t host_row_pitch, size_t host_slice_pitch, cl_event *event)
 {
         cl_int err;
-        struct starpu_worker_s *worker = _starpu_get_local_worker_key();
+        struct _starpu_worker *worker = _starpu_get_local_worker_key();
         cl_bool blocking;
 
         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
+        if (event)
+                _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
         err = clEnqueueWriteBufferRect(transfer_queues[worker->devid], buffer, blocking, buffer_origin, host_origin, region, buffer_row_pitch,
                                        buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, 0, NULL, event);
+        if (event)
+                _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
         return CL_SUCCESS;
@@ -300,9 +341,10 @@ cl_int _starpu_opencl_copy_rect_ram_to_opencl(void *ptr, cl_mem buffer, const si
 
 void _starpu_opencl_init(void)
 {
-	PTHREAD_MUTEX_LOCK(&big_lock);
-        if (!init_done) {
-                cl_platform_id platform_id[STARPU_OPENCL_PLATFORM_MAX];
+	_STARPU_PTHREAD_MUTEX_LOCK(&big_lock);
+        if (!init_done)
+	{
+                cl_platform_id platform_id[_STARPU_OPENCL_PLATFORM_MAX];
                 cl_uint nb_platforms;
                 cl_device_type device_type = CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR;
                 cl_int err;
@@ -311,26 +353,30 @@ void _starpu_opencl_init(void)
                 _STARPU_DEBUG("Initialising OpenCL\n");
 
                 // Get Platforms
-                err = clGetPlatformIDs(STARPU_OPENCL_PLATFORM_MAX, platform_id, &nb_platforms);
+                err = clGetPlatformIDs(_STARPU_OPENCL_PLATFORM_MAX, platform_id, &nb_platforms);
                 if (err != CL_SUCCESS) nb_platforms=0;
                 _STARPU_DEBUG("Platforms detected: %d\n", nb_platforms);
 
                 // Get devices
                 nb_devices = 0;
                 {
-                        for (i=0; i<nb_platforms; i++) {
+                        for (i=0; i<nb_platforms; i++)
+			{
                                 cl_uint num;
 				int platform_valid = 1;
 				char name[1024], vendor[1024];
 
 				err = clGetPlatformInfo(platform_id[i], CL_PLATFORM_NAME, 1024, name, NULL);
-				if (err != CL_SUCCESS) {
+				if (err != CL_SUCCESS)
+				{
 					STARPU_OPENCL_REPORT_ERROR_WITH_MSG("clGetPlatformInfo NAME", err);
 					platform_valid = 0;
 				}
-				else {
+				else
+				{
 					err = clGetPlatformInfo(platform_id[i], CL_PLATFORM_VENDOR, 1024, vendor, NULL);
-					if (err != CL_SUCCESS) {
+					if (err != CL_SUCCESS)
+					{
 						STARPU_OPENCL_REPORT_ERROR_WITH_MSG("clGetPlatformInfo VENDOR", err);
 						platform_valid = 0;
 					}
@@ -341,12 +387,15 @@ void _starpu_opencl_init(void)
 				else
 					_STARPU_DEBUG("Platform invalid\n");
 #endif
-				if (platform_valid) {
+				if (platform_valid)
+				{
 					err = clGetDeviceIDs(platform_id[i], device_type, STARPU_MAXOPENCLDEVS-nb_devices, &devices[nb_devices], &num);
-					if (err == CL_DEVICE_NOT_FOUND) {
+					if (err == CL_DEVICE_NOT_FOUND)
+					{
 						_STARPU_DEBUG("  No devices detected on this platform\n");
 					}
-					else {
+					else
+					{
 						if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 						_STARPU_DEBUG("  %d devices detected\n", num);
 						nb_devices += num;
@@ -358,13 +407,15 @@ void _starpu_opencl_init(void)
                 // Get location of OpenCl kernel source files
                 _starpu_opencl_program_dir = getenv("STARPU_OPENCL_PROGRAM_DIR");
 
-		if (nb_devices > STARPU_MAXOPENCLDEVS) {
-			_STARPU_DISP("# Warning: %d OpenCL devices available. Only %d enabled. Use configure option --enable-maxopencldev=xxx to update the maximum value of supported OpenCL devices?\n", nb_devices, STARPU_MAXOPENCLDEVS);
+		if (nb_devices > STARPU_MAXOPENCLDEVS)
+		{
+			_STARPU_DISP("# Warning: %u OpenCL devices available. Only %d enabled. Use configure option --enable-maxopencldev=xxx to update the maximum value of supported OpenCL devices?\n", nb_devices, STARPU_MAXOPENCLDEVS);
 			nb_devices = STARPU_MAXOPENCLDEVS;
 		}
 
                 // initialise internal structures
-                for(i=0 ; i<nb_devices ; i++) {
+                for(i=0 ; i<nb_devices ; i++)
+		{
                         contexts[i] = NULL;
                         queues[i] = NULL;
                         transfer_queues[i] = NULL;
@@ -372,15 +423,15 @@ void _starpu_opencl_init(void)
 
                 init_done=1;
         }
-	PTHREAD_MUTEX_UNLOCK(&big_lock);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&big_lock);
 }
 
 static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname);
-static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *args);
+static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_worker *args);
 
 void *_starpu_opencl_worker(void *arg)
 {
-	struct starpu_worker_s* args = arg;
+	struct _starpu_worker* args = arg;
 
 	int devid = args->devid;
 	int workerid = args->workerid;
@@ -390,7 +441,7 @@ void *_starpu_opencl_worker(void *arg)
 #endif
 
 	unsigned memnode = args->memory_node;
-	STARPU_TRACE_WORKER_INIT_START(STARPU_FUT_OPENCL_KEY, devid, memnode);
+	_STARPU_TRACE_WORKER_INIT_START(_STARPU_FUT_OPENCL_KEY, devid, memnode);
 
 	_starpu_bind_thread_on_cpu(args->config, args->bindid);
 
@@ -413,77 +464,79 @@ void *_starpu_opencl_worker(void *arg)
 
 	_STARPU_DEBUG("OpenCL (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, args->bindid);
 
-	STARPU_TRACE_WORKER_INIT_END
+	_STARPU_TRACE_WORKER_INIT_END
 
 	/* tell the main thread that this one is ready */
-	PTHREAD_MUTEX_LOCK(&args->mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&args->mutex);
 	args->worker_is_initialized = 1;
-	PTHREAD_COND_SIGNAL(&args->ready_cond);
-	PTHREAD_MUTEX_UNLOCK(&args->mutex);
+	_STARPU_PTHREAD_COND_SIGNAL(&args->ready_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&args->mutex);
 
-	struct starpu_job_s * j;
+	struct _starpu_job * j;
 	struct starpu_task *task;
 	int res;
 
 	pthread_cond_t *sched_cond = &args->sched_cond;
-        pthread_mutex_t *sched_mutex = &args->sched_mutex;
+	pthread_mutex_t *sched_mutex = &args->sched_mutex;
 
 	while (_starpu_machine_is_running())
 	{
-		STARPU_TRACE_START_PROGRESS(memnode);
+		_STARPU_TRACE_START_PROGRESS(memnode);
 		_starpu_datawizard_progress(memnode, 1);
-		STARPU_TRACE_END_PROGRESS(memnode);
+		_STARPU_TRACE_END_PROGRESS(memnode);
 
 		task = _starpu_pop_task(args);
 		
 		if (task == NULL) 
 		{
-			PTHREAD_MUTEX_LOCK(sched_mutex);
+			_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
 			if (_starpu_worker_can_block(memnode))
 				_starpu_block_worker(workerid, sched_cond, sched_mutex);
 
-			PTHREAD_MUTEX_UNLOCK(sched_mutex);
+			_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
 
 			continue;
 		};
 
-		PTHREAD_MUTEX_UNLOCK(sched_mutex);
-
 		STARPU_ASSERT(task);
 		j = _starpu_get_job_associated_to_task(task);
 
 		/* can OpenCL do that task ? */
-		if (!STARPU_OPENCL_MAY_PERFORM(j))
+		if (!_STARPU_OPENCL_MAY_PERFORM(j))
 		{
 			/* this is not a OpenCL task */
-			_starpu_push_task(j, 0);
+			_starpu_push_task(j);
 			continue;
 		}
 
 		_starpu_set_current_task(j->task);
+		args->current_task = j->task;
 
 		res = _starpu_opencl_execute_job(j, args);
 
-
-
 		_starpu_set_current_task(NULL);
+		args->current_task = NULL;
 
-                if (res) {
-			switch (res) {
+                if (res)
+		{
+			switch (res)
+			{
 				case -EAGAIN:
 					_STARPU_DISP("ouch, put the codelet %p back ... \n", j);
-					_starpu_push_task(j, 0);
+					_starpu_push_task(j);
 					STARPU_ABORT();
 					continue;
 				default:
-					assert(0);
+					STARPU_ASSERT(0);
 			}
 		}
 
-		_starpu_handle_job_termination(j, 0, workerid);
+		_starpu_handle_job_termination(j, workerid);
 	}
 
-	STARPU_TRACE_WORKER_DEINIT_START
+	_STARPU_TRACE_WORKER_DEINIT_START
+
+	_starpu_handle_all_pending_node_data_requests(memnode);
 
         _starpu_opencl_deinit_context(devid);
 
@@ -496,7 +549,8 @@ static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname)
 {
 	int err;
 
-        if (!init_done) {
+        if (!init_done)
+	{
                 _starpu_opencl_init();
         }
 
@@ -510,13 +564,14 @@ static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname)
 
 unsigned _starpu_opencl_get_device_count(void)
 {
-        if (!init_done) {
+        if (!init_done)
+	{
                 _starpu_opencl_init();
         }
 	return nb_devices;
 }
 
-static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *args)
+static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_worker *args)
 {
 	int ret;
 	uint32_t mask = 0;
@@ -527,11 +582,12 @@ static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *ar
 	struct timespec codelet_start, codelet_end;
 
 	STARPU_ASSERT(task);
-	struct starpu_codelet_t *cl = task->cl;
+	struct starpu_codelet *cl = task->cl;
 	STARPU_ASSERT(cl);
 
-	ret = _starpu_fetch_task_input(task, mask);
-	if (ret != 0) {
+	ret = _starpu_fetch_task_input(j, mask);
+	if (ret != 0)
+	{
 		/* there was not enough memory, so the input of
 		 * the codelet cannot be fetched ... put the
 		 * codelet back, and try it later */
@@ -540,26 +596,16 @@ static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *ar
 
 	_starpu_driver_start_job(args, j, &codelet_start, 0);
 
-	if (cl->opencl_func != STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS) {
-		cl_func func = cl->opencl_func;
-		STARPU_ASSERT(func);
-		func(task->interfaces, task->cl_arg);
-	}
-	else {
-		if (cl->opencl_funcs[j->nimpl] != NULL) {
-			/* _STARPU_DEBUG("OpenCL driver : running kernel (%d)\n", j->nimpl); */
-			cl_func func = cl->opencl_funcs[j->nimpl];
-			STARPU_ASSERT(func);
-			func(task->interfaces, task->cl_arg);
-		}
-	}
+	starpu_opencl_func_t func = _starpu_task_get_opencl_nth_implementation(cl, j->nimpl);
+	STARPU_ASSERT(func);
+	func(task->interfaces, task->cl_arg);
 
-	_starpu_driver_end_job(args, j, &codelet_end, 0);
+	_starpu_driver_end_job(args, j, args->perf_arch, &codelet_end, 0);
 
 	_starpu_driver_update_job_feedback(j, args, args->perf_arch,
 							&codelet_start, &codelet_end);
 
-	_starpu_push_task_output(task, mask);
+	_starpu_push_task_output(j, mask);
 
 	return EXIT_SUCCESS;
 }
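
As an aside on the API made public above: starpu_opencl_get_global_mem_size(), starpu_opencl_allocate_memory() and starpu_opencl_get_current_queue() are meant to be called from code running on an OpenCL worker, since they rely on the calling worker's device and queues. A minimal sketch of how a codelet body might use them, assuming only the signatures introduced in this patch; the function name, the buffer size and the use of starpu_worker_get_id()/starpu_worker_get_devid() are illustrative:

#include <starpu.h>
#include <starpu_opencl.h>

/* Hypothetical OpenCL codelet body. */
static void opencl_codelet_func(void *buffers[] STARPU_ATTRIBUTE_UNUSED, void *cl_arg STARPU_ATTRIBUTE_UNUSED)
{
	int devid = starpu_worker_get_devid(starpu_worker_get_id());
	cl_command_queue queue;
	cl_mem scratch;

	/* Total global memory of the device driving this worker. */
	size_t total = starpu_opencl_get_global_mem_size(devid);

	/* Allocate a temporary buffer on the current device; the helper
	 * returns CL_OUT_OF_HOST_MEMORY instead of aborting on OOM. */
	if (starpu_opencl_allocate_memory(&scratch, 1024*1024, CL_MEM_READ_WRITE) != CL_SUCCESS)
		return;

	/* Per-worker queue on which kernels are enqueued. */
	starpu_opencl_get_current_queue(&queue);

	(void) total; (void) queue;
	clReleaseMemObject(scratch);
}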

+ 3 - 17
src/drivers/opencl/driver_opencl.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -39,29 +39,15 @@ int _starpu_opencl_deinit_context(int devid);
 extern
 unsigned _starpu_opencl_get_device_count(void);
 
-extern
-cl_int _starpu_opencl_allocate_memory(void **addr, size_t size, cl_mem_flags flags);
-
-extern
-cl_int _starpu_opencl_copy_ram_to_opencl(void *ptr, cl_mem buffer, size_t size, size_t offset, cl_event *event);
-
-extern
-cl_int _starpu_opencl_copy_opencl_to_ram(cl_mem buffer, void *ptr, size_t size, size_t offset, cl_event *event);
-
-extern
-cl_int _starpu_opencl_copy_ram_to_opencl_async_sync(void *ptr, cl_mem buffer, size_t size, size_t offset, cl_event *event, int *ret);
-
-extern
-cl_int _starpu_opencl_copy_opencl_to_ram_async_sync(cl_mem buffer, void *ptr, size_t size, size_t offset, cl_event *event, int *ret);
 
 #if 0
 extern
-cl_int _starpu_opencl_copy_rect_opencl_to_ram(cl_mem buffer, void *ptr, const size_t buffer_origin[3], const size_t host_origin[3],
+cl_int _starpu_opencl_copy_rect_opencl_to_ram(cl_mem buffer, unsigned src_node, void *ptr, unsigned dst_node, const size_t buffer_origin[3], const size_t host_origin[3],
                                               const size_t region[3], size_t buffer_row_pitch, size_t buffer_slice_pitch,
                                               size_t host_row_pitch, size_t host_slice_pitch, cl_event *event);
 
 extern
-cl_int _starpu_opencl_copy_rect_ram_to_opencl(void *ptr, cl_mem buffer, const size_t buffer_origin[3], const size_t host_origin[3],
+cl_int _starpu_opencl_copy_rect_ram_to_opencl(void *ptr, unsigned src_node, cl_mem buffer, unsigned dst_node, const size_t buffer_origin[3], const size_t host_origin[3],
                                               const size_t region[3], size_t buffer_row_pitch, size_t buffer_slice_pitch,
                                               size_t host_row_pitch, size_t host_slice_pitch, cl_event *event);
 #endif
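
The prototypes dropped here are the internal _starpu_opencl_copy_* helpers; as the driver_opencl.c hunks above show, they become public starpu_opencl_copy_* functions that additionally take the source and destination memory nodes so the driver can emit transfer trace events. A hedged sketch of calling the asynchronous write variant, e.g. from a custom data interface; the my_vector layout and the -EIO error policy are purely illustrative, and the call must run on the OpenCL worker driving the destination device since the helper uses the calling worker's transfer queue:

#include <errno.h>
#include <stdint.h>
#include <starpu.h>
#include <starpu_opencl.h>

struct my_vector                /* hypothetical interface layout */
{
	uintptr_t ptr;          /* host pointer */
	cl_mem dev;             /* device buffer */
	size_t size;            /* bytes to transfer */
};

static int push_to_device(struct my_vector *v, unsigned src_node, unsigned dst_node, cl_event *event)
{
	int ret;
	cl_int err = starpu_opencl_copy_ram_to_opencl_async_sync((void *) v->ptr, src_node,
								 v->dev, dst_node,
								 v->size, 0 /* offset */,
								 event, &ret);
	if (err != CL_SUCCESS)
		return -EIO;

	/* ret is -EAGAIN when the copy was merely submitted (event != NULL),
	 * and 0 when it already completed synchronously. */
	return ret;
}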

+ 125 - 44
src/drivers/opencl/driver_opencl_utils.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -39,27 +39,56 @@ char *_starpu_opencl_program_dir;
 #define _STARPU_STRINGIFY(x) _STARPU_STRINGIFY_(x)
 
 static
-int _starpu_opencl_locate_file(const char *source_file_name, char *located_file_name) {
+int _starpu_opencl_locate_file(const char *source_file_name, char *located_file_name, char *located_dir_name)
+{
+	int ret = EXIT_FAILURE;
+
         _STARPU_DEBUG("Trying to locate <%s>\n", source_file_name);
-        if (access(source_file_name, R_OK) == 0) {
+        if (access(source_file_name, R_OK) == 0)
+	{
                 strcpy(located_file_name, source_file_name);
-                return EXIT_SUCCESS;
+		ret = EXIT_SUCCESS;
         }
-        if (_starpu_opencl_program_dir) {
-                sprintf(located_file_name, "%s/%s", _starpu_opencl_program_dir, source_file_name);
-                _STARPU_DEBUG("Trying to locate <%s>\n", located_file_name);
-                if (access(located_file_name, R_OK) == 0) return EXIT_SUCCESS;
-        }
-        sprintf(located_file_name, "%s/%s", _STARPU_STRINGIFY(STARPU_OPENCL_DATADIR), source_file_name);
-        _STARPU_DEBUG("Trying to locate <%s>\n", located_file_name);
-        if (access(located_file_name, R_OK) == 0) return EXIT_SUCCESS;
-        sprintf(located_file_name, "%s/%s", STARPU_SRC_DIR, source_file_name);
-        _STARPU_DEBUG("Trying to locate <%s>\n", located_file_name);
-        if (access(located_file_name, R_OK) == 0) return EXIT_SUCCESS;
-
-        strcpy(located_file_name, "");
-        _STARPU_ERROR("Cannot locate file <%s>\n", source_file_name);
-        return EXIT_FAILURE;
+
+	if (ret == EXIT_FAILURE && _starpu_opencl_program_dir)
+	{
+		sprintf(located_file_name, "%s/%s", _starpu_opencl_program_dir, source_file_name);
+		_STARPU_DEBUG("Trying to locate <%s>\n", located_file_name);
+		if (access(located_file_name, R_OK) == 0) ret = EXIT_SUCCESS;
+	}
+
+	if (ret == EXIT_FAILURE)
+	{
+		sprintf(located_file_name, "%s/%s", _STARPU_STRINGIFY(STARPU_OPENCL_DATADIR), source_file_name);
+		_STARPU_DEBUG("Trying to locate <%s>\n", located_file_name);
+		if (access(located_file_name, R_OK) == 0) ret = EXIT_SUCCESS;
+	}
+
+	if (ret == EXIT_FAILURE)
+	{
+		sprintf(located_file_name, "%s/%s", STARPU_SRC_DIR, source_file_name);
+		_STARPU_DEBUG("Trying to locate <%s>\n", located_file_name);
+		if (access(located_file_name, R_OK) == 0) ret = EXIT_SUCCESS;
+	}
+
+	if (ret == EXIT_FAILURE)
+	{
+		strcpy(located_file_name, "");
+		strcpy(located_dir_name, "");
+		_STARPU_ERROR("Cannot locate file <%s>\n", source_file_name);
+	}
+	else
+	{
+		char *last = strrchr(located_file_name, '/');
+		if (!last) strcpy(located_dir_name, "");
+		else
+		{
+			sprintf(located_dir_name, "%s", located_file_name);
+			located_dir_name[strlen(located_file_name)-strlen(last)+1] = '\0';
+		}
+	}
+
+        return ret;
 }
 
 cl_int starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue, struct starpu_opencl_program *opencl_programs,
@@ -75,7 +104,8 @@ cl_int starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue, str
         starpu_opencl_get_queue(devid, queue);
 
         program = opencl_programs->programs[devid];
-        if (!program) {
+        if (!program)
+	{
                 _STARPU_DISP("Program not available\n");
                 return CL_INVALID_PROGRAM;
         }
@@ -87,7 +117,8 @@ cl_int starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue, str
 	return CL_SUCCESS;
 }
 
-cl_int starpu_opencl_release_kernel(cl_kernel kernel) {
+cl_int starpu_opencl_release_kernel(cl_kernel kernel)
+{
 	cl_int err;
 
 	err = clReleaseKernel(kernel);
@@ -106,14 +137,15 @@ char *_starpu_opencl_load_program_source(const char *filename)
         char        c;
 
         fh = fopen(filename, "r");
-        if (fh == 0)
+        if (!fh)
                 return NULL;
 
         stat(filename, &statbuf);
         source = (char *) malloc(statbuf.st_size + 1);
 
-        for(c=fgetc(fh), x=0 ; c != EOF ; c = fgetc(fh), x++) {
-          source[x] = c;
+        for(c=(char)fgetc(fh), x=0 ; c != EOF ; c =(char)fgetc(fh), x++)
+	{
+		source[x] = c;
         }
         source[x] = '\0';
 
@@ -133,30 +165,34 @@ int starpu_opencl_load_opencl_from_string(const char *opencl_program_source, str
 
         nb_devices = _starpu_opencl_get_device_count();
         // Iterate over each device
-        for(dev = 0; dev < nb_devices; dev ++) {
+        for(dev = 0; dev < nb_devices; dev ++)
+	{
                 cl_device_id device;
                 cl_context   context;
                 cl_program   program;
                 cl_int       err;
 
+                opencl_programs->programs[dev] = NULL;
+
                 starpu_opencl_get_device(dev, &device);
                 starpu_opencl_get_context(dev, &context);
-                if (context == NULL) {
+                if (context == NULL)
+		{
                         _STARPU_DEBUG("[%d] is not a valid OpenCL context\n", dev);
                         continue;
                 }
 
-                opencl_programs->programs[dev] = NULL;
-
-                if (context == NULL) continue;
-
                 // Create the compute program from the source buffer
                 program = clCreateProgramWithSource(context, 1, (const char **) &opencl_program_source, NULL, &err);
-                if (!program || err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+                if (!program || err != CL_SUCCESS) {
+			_STARPU_DISP("Error: Failed to load program source!\n");
+			return EXIT_FAILURE;
+		}
 
                 // Build the program executable
                 err = clBuildProgram(program, 1, &device, build_options, NULL, NULL);
-                if (err != CL_SUCCESS) {
+                if (err != CL_SUCCESS)
+		{
                         size_t len;
                         static char buffer[4096];
 
@@ -178,21 +214,32 @@ int starpu_opencl_load_opencl_from_file(const char *source_file_name, struct sta
 {
 	int nb_devices;
         char located_file_name[1024];
+        char located_dir_name[1024];
+	char new_build_options[1024];
 
 	// Do not try to load and compile the file if there are no devices
-	nb_devices = _starpu_opencl_get_device_count();
+	nb_devices = starpu_opencl_worker_get_count();
 	if (nb_devices == 0) return EXIT_SUCCESS;
 
         // Locate source file
-        _starpu_opencl_locate_file(source_file_name, located_file_name);
+        _starpu_opencl_locate_file(source_file_name, located_file_name, located_dir_name);
         _STARPU_DEBUG("Source file name : <%s>\n", located_file_name);
+        _STARPU_DEBUG("Source directory name : <%s>\n", located_dir_name);
 
         // Load the compute program from disk into a cstring buffer
         char *opencl_program_source = _starpu_opencl_load_program_source(located_file_name);
         if(!opencl_program_source)
                 _STARPU_ERROR("Failed to load compute program from file <%s>!\n", located_file_name);
 
-        return starpu_opencl_load_opencl_from_string(opencl_program_source, opencl_programs, build_options);
+	if (!strcmp(located_dir_name, ""))
+		strcpy(new_build_options, build_options);
+	else if (build_options)
+		sprintf(new_build_options, "-I %s %s", located_dir_name, build_options);
+	else
+		sprintf(new_build_options, "-I %s", located_dir_name);
+	_STARPU_DEBUG("Build options: <%s>\n", new_build_options);
+
+        return starpu_opencl_load_opencl_from_string(opencl_program_source, opencl_programs, new_build_options);
 }
 
 cl_int starpu_opencl_unload_opencl(struct starpu_opencl_program *opencl_programs)
@@ -200,9 +247,13 @@ cl_int starpu_opencl_unload_opencl(struct starpu_opencl_program *opencl_programs
         unsigned int dev;
         unsigned int nb_devices;
 
+	if (!starpu_opencl_worker_get_count())
+		return CL_SUCCESS;
+
         nb_devices = _starpu_opencl_get_device_count();
         // Iterate over each device
-        for(dev = 0; dev < nb_devices; dev ++) {
+        for(dev = 0; dev < nb_devices; dev ++)
+	{
                 if (opencl_programs->programs[dev])
                         clReleaseProgram(opencl_programs->programs[dev]);
         }
@@ -212,12 +263,13 @@ cl_int starpu_opencl_unload_opencl(struct starpu_opencl_program *opencl_programs
 int starpu_opencl_collect_stats(cl_event event STARPU_ATTRIBUTE_UNUSED)
 {
 #if defined(CL_PROFILING_CLOCK_CYCLE_COUNT)||defined(CL_PROFILING_STALL_CYCLE_COUNT)||defined(CL_PROFILING_POWER_CONSUMED)
-	struct starpu_task *task = starpu_get_current_task();
+	struct starpu_task *task = starpu_task_get_current();
 	struct starpu_task_profiling_info *info = task->profiling_info;
 #endif
 
 #ifdef CL_PROFILING_CLOCK_CYCLE_COUNT
-	if (starpu_profiling_status_get() && info) {
+	if (starpu_profiling_status_get() && info)
+	{
 		cl_int err;
 		unsigned int clock_cycle_count;
 		size_t size;
@@ -228,7 +280,8 @@ int starpu_opencl_collect_stats(cl_event event STARPU_ATTRIBUTE_UNUSED)
 	}
 #endif
 #ifdef CL_PROFILING_STALL_CYCLE_COUNT
-	if (starpu_profiling_status_get() && info) {
+	if (starpu_profiling_status_get() && info)
+	{
 		cl_int err;
 		unsigned int stall_cycle_count;
 		size_t size;
@@ -240,7 +293,8 @@ int starpu_opencl_collect_stats(cl_event event STARPU_ATTRIBUTE_UNUSED)
 	}
 #endif
 #ifdef CL_PROFILING_POWER_CONSUMED
-	if (info && (starpu_profiling_status_get() || (task->cl && task->cl->power_model && task->cl->power_model->benchmarking))) {
+	if (info && (starpu_profiling_status_get() || (task->cl && task->cl->power_model && task->cl->power_model->benchmarking)))
+	{
 		cl_int err;
 		double power_consumed;
 		size_t size;
@@ -255,10 +309,11 @@ int starpu_opencl_collect_stats(cl_event event STARPU_ATTRIBUTE_UNUSED)
 	return 0;
 }
 
-void starpu_opencl_display_error(const char *func, const char* msg, cl_int status)
+void starpu_opencl_display_error(const char *func, const char *file, int line, const char* msg, cl_int status)
 {
 	const char *errormsg;
-	switch (status) {
+	switch (status)
+	{
 	case CL_SUCCESS:
 		errormsg = "success";
 		break;
@@ -407,8 +462,34 @@ void starpu_opencl_display_error(const char *func, const char* msg, cl_int statu
 		break;
 	}
 	if (msg)
-		printf("oops in %s (%s) ... <%s> (%d) \n", func, msg, errormsg, status);
+		printf("oops in %s (%s:%d) (%s) ... <%s> (%d) \n", func, file, line, msg, errormsg, status);
 	else
-		printf("oops in %s ... <%s> (%d) \n", func, errormsg, status);
+		printf("oops in %s (%s:%d) ... <%s> (%d) \n", func, file, line, errormsg, status);
+
+}
+
+int starpu_opencl_set_kernel_args(cl_int *error, cl_kernel *kernel, ...)
+{
+	int i;
+	va_list ap;
+
+	va_start(ap, kernel);
+
+	for (i = 0; ; i++)
+	{
+		int size = va_arg(ap, int);
+		if (size == 0)
+			break;
+
+		cl_mem *ptr = va_arg(ap, cl_mem *);
+		int err = clSetKernelArg(*kernel, i, size, ptr);
+		if (err != CL_SUCCESS)
+		{
+			*error = err;
+			break;
+		}
+	}
 
+	va_end(ap);
+	return i;
 }
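
Taken together, the changes to this file give the following usage pattern: starpu_opencl_load_opencl_from_file() now adds the directory containing the located .cl file to the build options (-I), so kernels may #include headers sitting next to their source, and starpu_opencl_set_kernel_args() sets kernel arguments from (size, pointer) pairs terminated by a size of 0, returning the number of arguments set. A minimal sketch, assuming the usual kernel-name/device-id trailing arguments of starpu_opencl_load_kernel() (its prototype is truncated in the hunk above); "my_kernel.cl", "my_kernel" and the build options are placeholders:

#include <stdlib.h>
#include <starpu.h>
#include <starpu_opencl.h>

static struct starpu_opencl_program programs;

void load_programs(void)
{
	int ret = starpu_opencl_load_opencl_from_file("my_kernel.cl", &programs, "-DBLOCK=16");
	STARPU_ASSERT(ret == EXIT_SUCCESS);
}

/* Typically called from the codelet running on the OpenCL worker for devid. */
void run_kernel(cl_mem data, unsigned n, int devid)
{
	cl_kernel kernel;
	cl_command_queue queue;
	cl_int err;

	/* Trailing arguments (kernel name, device id) assumed from the usual StarPU API. */
	err = starpu_opencl_load_kernel(&kernel, &queue, &programs, "my_kernel", devid);
	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

	/* (size, pointer) pairs, terminated by a size of 0; the return value
	 * is the number of arguments that were actually set. */
	int nargs = starpu_opencl_set_kernel_args(&err, &kernel,
						  sizeof(data), &data,
						  sizeof(n), &n,
						  0);
	if (nargs != 2) STARPU_OPENCL_REPORT_ERROR(err);

	size_t global = n;
	err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

	clFinish(queue);
	starpu_opencl_release_kernel(kernel);
}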

+ 2 - 2
src/drivers/opencl/driver_opencl_utils.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,6 +19,6 @@
 
 #include <config.h>
 
-#define STARPU_OPENCL_PLATFORM_MAX 4
+#define _STARPU_OPENCL_PLATFORM_MAX 4
 
 #endif /* __STARPU_OPENCL_UTILS_H__ */