
merge datawizard+drivers

Andra Hugo, 13 years ago
commit 760bc799be
46 changed files with 4134 additions and 2098 deletions
  1. src/datawizard/coherency.c (+245 -135)
  2. src/datawizard/coherency.h (+106 -64)
  3. src/datawizard/copy_driver.c (+123 -97)
  4. src/datawizard/copy_driver.h (+18 -16)
  5. src/datawizard/data_request.c (+256 -98)
  6. src/datawizard/data_request.h (+34 -28)
  7. src/datawizard/datastats.c (+87 -18)
  8. src/datawizard/datastats.h (+12 -2)
  9. src/datawizard/filters.c (+104 -50)
  10. src/datawizard/footprint.c (+24 -14)
  11. src/datawizard/footprint.h (+4 -4)
  12. src/datawizard/interfaces/bcsr_filters.c (+8 -7)
  13. src/datawizard/interfaces/bcsr_interface.c (+97 -88)
  14. src/datawizard/interfaces/block_filters.c (+4 -3)
  15. src/datawizard/interfaces/block_interface.c (+146 -112)
  16. src/datawizard/interfaces/csr_filters.c (+13 -12)
  17. src/datawizard/interfaces/csr_interface.c (+136 -96)
  18. src/datawizard/interfaces/data_interface.c (+179 -75)
  19. src/datawizard/interfaces/data_interface.h (+6 -3)
  20. src/datawizard/interfaces/matrix_filters.c (+26 -23)
  21. src/datawizard/interfaces/matrix_interface.c (+166 -105)
  22. src/datawizard/interfaces/multiformat_interface.c (+724 -0)
  23. src/datawizard/interfaces/variable_interface.c (+101 -78)
  24. src/datawizard/interfaces/vector_filters.c (+53 -45)
  25. src/datawizard/interfaces/vector_interface.c (+111 -87)
  26. src/datawizard/interfaces/void_interface.c (+16 -14)
  27. src/datawizard/memalloc.c (+317 -144)
  28. src/datawizard/memalloc.h (+16 -8)
  29. src/datawizard/memory_nodes.c (+35 -22)
  30. src/datawizard/memory_nodes.h (+20 -24)
  31. src/datawizard/progress.c (+3 -2)
  32. src/datawizard/reduction.c (+155 -82)
  33. src/datawizard/sort_data_handles.c (+33 -12)
  34. src/datawizard/sort_data_handles.h (+1 -1)
  35. src/datawizard/user_interactions.c (+143 -99)
  36. src/datawizard/write_back.c (+32 -16)
  37. src/datawizard/write_back.h (+2 -2)
  38. src/drivers/cpu/driver_cpu.c (+57 -59)
  39. src/drivers/cuda/driver_cuda.c (+98 -67)
  40. src/drivers/driver_common/driver_common.c (+35 -28)
  41. src/drivers/driver_common/driver_common.h (+8 -8)
  42. src/drivers/gordon/driver_gordon.c (+107 -90)
  43. src/drivers/opencl/driver_opencl.c (+143 -97)
  44. src/drivers/opencl/driver_opencl.h (+3 -17)
  45. src/drivers/opencl/driver_opencl_utils.c (+125 -44)
  46. src/drivers/opencl/driver_opencl_utils.h (+2 -2)

src/datawizard/coherency.c (+245 -135)

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,21 +21,27 @@
 #include <datawizard/write_back.h>
 #include <core/dependencies/data_concurrency.h>
 #include <profiling/profiling.h>
+#include <math.h>
 
-uint32_t _starpu_select_src_node(starpu_data_handle handle)
+static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned src_node, unsigned dst_node, unsigned *handling_node);
+uint32_t _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 {
-	unsigned src_node = 0;
+	int src_node = -1;
 	unsigned i;
 
-	unsigned nnodes = _starpu_get_memory_nodes_count();
+	unsigned nnodes = starpu_memory_nodes_get_count();
 
 	/* first find a valid copy, either a STARPU_OWNER or a STARPU_SHARED */
 	uint32_t node;
 
 	uint32_t src_node_mask = 0;
+	size_t size = _starpu_data_get_size(handle);
+	double cost = INFINITY;
+
 	for (node = 0; node < nnodes; node++)
 	{
-		if (handle->per_node[node].state != STARPU_INVALID) {
+		if (handle->per_node[node].state != STARPU_INVALID)
+		{
 			/* we found a copy ! */
 			src_node_mask |= (1<<node);
 		}
@@ -44,7 +50,42 @@ uint32_t _starpu_select_src_node(starpu_data_handle handle)
 	/* we should have found at least one copy ! */
 	STARPU_ASSERT(src_node_mask != 0);
 
-	/* find the node that will be the actual source */
+	/* Without knowing the size, we won't know the cost */
+	if (!size)
+		cost = 0;
+
+	/* Check whether we have transfer cost for all nodes, if so, take the minimum */
+	if (cost)
+		for (i = 0; i < nnodes; i++)
+		{
+			if (src_node_mask & (1<<i))
+			{
+				double time = _starpu_predict_transfer_time(i, destination, size);
+				unsigned handling_node;
+
+				/* Avoid indirect transfers */
+				if (!link_supports_direct_transfers(handle, i, destination, &handling_node))
+					continue;
+
+				if (_STARPU_IS_ZERO(time))
+				{
+					/* No estimation, will have to revert to dumb strategy */
+					cost = 0.0;
+					break;
+				}
+				else if (time < cost)
+				{
+					cost = time;
+					src_node = i;
+				}
+			}
+		}
+
+	if (cost && src_node != -1)
+		/* Could estimate through cost, return that */
+		return src_node;
+
+	/* Revert to dumb strategy: take RAM unless only a GPU has it */
 	for (i = 0; i < nnodes; i++)
 	{
 		if (src_node_mask & (1<<i))
@@ -53,36 +94,41 @@ uint32_t _starpu_select_src_node(starpu_data_handle handle)
 			src_node = i;
 
 			/* however GPU are expensive sources, really !
-			 * 	other should be ok */
-		 
-			if (_starpu_get_node_kind(i) != STARPU_CUDA_RAM && _starpu_get_node_kind(i) != STARPU_OPENCL_RAM)	
+			 * 	Unless peer transfer is supported.
+			 * 	Other should be ok */
+
+			if (
+#ifndef HAVE_CUDA_MEMCPY_PEER
+					starpu_node_get_kind(i) != STARPU_CUDA_RAM &&
+#endif
+					starpu_node_get_kind(i) != STARPU_OPENCL_RAM)
 				break ;
-		 
-			/* XXX do a better algorithm to distribute the memory copies */
-			/* TODO : use the "requesting_node" as an argument to do so */
 		}
 	}
 
+	STARPU_ASSERT(src_node != -1);
+
 	return src_node;
 }
 
 /* this may be called once the data is fetched with header and STARPU_RW-lock hold */
-void _starpu_update_data_state(starpu_data_handle handle,
-				struct starpu_data_replicate_s *requesting_replicate,
-				starpu_access_mode mode)
+void _starpu_update_data_state(starpu_data_handle_t handle,
+			       struct _starpu_data_replicate *requesting_replicate,
+			       enum starpu_access_mode mode)
 {
 	/* There is nothing to do for relaxed coherency modes (scratch or
 	 * reductions) */
 	if (!(mode & STARPU_RW))
 		return;
 
-	unsigned nnodes = _starpu_get_memory_nodes_count();
+	unsigned nnodes = starpu_memory_nodes_get_count();
 
 	/* the data is present now */
 	unsigned requesting_node = requesting_replicate->memory_node;
 	requesting_replicate->requested[requesting_node] = 0;
 
-	if (mode & STARPU_W) {
+	if (mode & STARPU_W)
+	{
 		/* the requesting node now has the only valid copy */
 		uint32_t node;
 		for (node = 0; node < nnodes; node++)
@@ -90,14 +136,15 @@ void _starpu_update_data_state(starpu_data_handle handle,
 
 		requesting_replicate->state = STARPU_OWNER;
 	}
-	else { /* read only */
+	else
+	{ /* read only */
 		if (requesting_replicate->state != STARPU_OWNER)
 		{
 			/* there was at least another copy of the data */
 			uint32_t node;
 			for (node = 0; node < nnodes; node++)
 			{
-				struct starpu_data_replicate_s *replicate = &handle->per_node[node];
+				struct _starpu_data_replicate *replicate = &handle->per_node[node];
 				if (replicate->state != STARPU_INVALID)
 					replicate->state = STARPU_SHARED;
 			}
@@ -111,14 +158,18 @@ static int worker_supports_direct_access(unsigned node, unsigned handling_node)
 	if (node == handling_node)
 		return 1;
 
-	int type = _starpu_get_node_kind(node);
+	if (!_starpu_memory_node_workers(handling_node))
+		/* No worker to process the request from that node */
+		return 0;
+
+	int type = starpu_node_get_kind(node);
 	switch (type)
 	{
 		case STARPU_CUDA_RAM:
 #ifdef HAVE_CUDA_MEMCPY_PEER
 			/* GPUs not always allow direct remote access: if CUDA4
 			 * is enabled, we allow two CUDA devices to communicate. */
-			return (_starpu_get_node_kind(handling_node) != STARPU_OPENCL_RAM);
+			return (starpu_node_get_kind(handling_node) != STARPU_OPENCL_RAM);
 #else
 			/* Direct GPU-GPU transfers are not allowed in general */
 			return 0;
@@ -130,13 +181,15 @@ static int worker_supports_direct_access(unsigned node, unsigned handling_node)
 	}
 }
 
-static int link_supports_direct_transfers(starpu_data_handle handle, unsigned src_node, unsigned dst_node, unsigned *handling_node)
+static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned src_node, unsigned dst_node, unsigned *handling_node)
 {
+	(void) handle; // unused
+
 	/* XXX That's a hack until we get cudaMemcpy3DPeerAsync to work !
 	 * Perhaps not all data interface provide a direct GPU-GPU transfer
 	 * method ! */
 #ifdef STARPU_USE_CUDA
-	if (src_node != dst_node && _starpu_get_node_kind(src_node) == STARPU_CUDA_RAM && _starpu_get_node_kind(dst_node) == STARPU_CUDA_RAM)
+	if (src_node != dst_node && starpu_node_get_kind(src_node) == STARPU_CUDA_RAM && starpu_node_get_kind(dst_node) == STARPU_CUDA_RAM)
 	{
 		const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
 		if (!copy_methods->cuda_to_cuda_async)
@@ -163,11 +216,11 @@ static int link_supports_direct_transfers(starpu_data_handle handle, unsigned sr
  * node that handles the hop. The returned value indicates the number of hops,
  * and the max_len is the maximum number of hops (ie. the size of the
  * src_nodes, dst_nodes and handling_nodes arrays. */
-static int determine_request_path(starpu_data_handle handle,
-				unsigned src_node, unsigned dst_node,
-				starpu_access_mode mode, int max_len,
-				unsigned *src_nodes, unsigned *dst_nodes,
-				unsigned *handling_nodes)
+static int determine_request_path(starpu_data_handle_t handle,
+				  unsigned src_node, unsigned dst_node,
+				  enum starpu_access_mode mode, int max_len,
+				  unsigned *src_nodes, unsigned *dst_nodes,
+				  unsigned *handling_nodes)
 {
 	if (!(mode & STARPU_R))
 	{
@@ -182,7 +235,8 @@ static int determine_request_path(starpu_data_handle handle,
 	unsigned handling_node;
 	int link_is_valid = link_supports_direct_transfers(handle, src_node, dst_node, &handling_node);
 
-	if (!link_is_valid) {
+	if (!link_is_valid)
+	{
 		/* We need an intermediate hop to implement data staging
 		 * through main memory. */
 		STARPU_ASSERT(max_len >= 2);
@@ -201,15 +255,16 @@ static int determine_request_path(starpu_data_handle handle,
 
 		return 2;
 	}
-	else {
+	else
+	{
 		STARPU_ASSERT(max_len >= 1);
-		
+
 		src_nodes[0] = src_node;
 		dst_nodes[0] = dst_node;
 		handling_nodes[0] = handling_node;
 
 #ifndef HAVE_CUDA_MEMCPY_PEER
-		STARPU_ASSERT(!(mode & STARPU_R) || _starpu_get_node_kind(src_node) != STARPU_CUDA_RAM || _starpu_get_node_kind(dst_node) != STARPU_CUDA_RAM);
+		STARPU_ASSERT(!(mode & STARPU_R) || starpu_node_get_kind(src_node) != STARPU_CUDA_RAM || starpu_node_get_kind(dst_node) != STARPU_CUDA_RAM);
 #endif
 
 		return 1;
@@ -219,9 +274,9 @@ static int determine_request_path(starpu_data_handle handle,
 /* handle->lock should be taken. r is returned locked. The node parameter
  * indicate either the source of the request, or the destination for a
  * write-only request. */
-static starpu_data_request_t _starpu_search_existing_data_request(struct starpu_data_replicate_s *replicate, unsigned node, starpu_access_mode mode)
+static struct _starpu_data_request *_starpu_search_existing_data_request(struct _starpu_data_replicate *replicate, unsigned node, enum starpu_access_mode mode, unsigned is_prefetch)
 {
-	starpu_data_request_t r;
+	struct _starpu_data_request *r;
 
 	r = replicate->request[node];
 
@@ -229,20 +284,28 @@ static starpu_data_request_t _starpu_search_existing_data_request(struct starpu_
 	{
 		_starpu_spin_lock(&r->lock);
 
-		/* perhaps we need to "upgrade" the request */
+                /* perhaps we need to "upgrade" the request */
+		if (is_prefetch < r->prefetch)
+			_starpu_update_prefetch_status(r);
+
 		if (mode & STARPU_R)
 		{
 			/* in case the exisiting request did not imply a memory
-			 * transfer yet, we have to increment the refcnt now
+			 * transfer yet, we have to take a second refcnt now
+			 * for the source, in addition to the refcnt for the
+			 * destination
 			 * (so that the source remains valid) */
 			if (!(r->mode & STARPU_R))
+			{
 				replicate->refcnt++;
+				replicate->handle->busy_count++;
+			}
 
-			r->mode |= STARPU_R;
+			r->mode = (enum starpu_access_mode) ((int) r->mode | (int) STARPU_R);
 		}
 
 		if (mode & STARPU_W)
-			r->mode |= STARPU_W;
+			r->mode = (enum starpu_access_mode) ((int) r->mode | (int)  STARPU_W);
 	}
 
 	return r;
@@ -252,38 +315,53 @@ static starpu_data_request_t _starpu_search_existing_data_request(struct starpu_
 
 /*
  * This function is called when the data is needed on the local node, this
- * returns a pointer to the local copy 
+ * returns a pointer to the local copy
  *
  *			R 	STARPU_W 	STARPU_RW
  *	Owner		OK	OK	OK
  *	Shared		OK	1	1
  *	Invalid		2	3	4
  *
- * case 1 : shared + (read)write : 
+ * case 1 : shared + (read)write :
  * 	no data copy but shared->Invalid/Owner
- * case 2 : invalid + read : 
+ * case 2 : invalid + read :
  * 	data copy + invalid->shared + owner->shared (STARPU_ASSERT(there is a valid))
- * case 3 : invalid + write : 
+ * case 3 : invalid + write :
  * 	no data copy + invalid->owner + (owner,shared)->invalid
- * case 4 : invalid + R/STARPU_W : 
- * 	data copy + if (STARPU_W) (invalid->owner + owner->invalid) 
+ * case 4 : invalid + R/STARPU_W :
+ * 	data copy + if (STARPU_W) (invalid->owner + owner->invalid)
  * 		    else (invalid,owner->shared)
  */
 
 /* This function is called with handle's header lock taken */
-starpu_data_request_t create_request_to_fetch_data(starpu_data_handle handle,
-				struct starpu_data_replicate_s *dst_replicate,
-                                starpu_access_mode mode, unsigned is_prefetch,
-                                void (*callback_func)(void *), void *callback_arg)
+struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
+								  struct _starpu_data_replicate *dst_replicate,
+								  enum starpu_access_mode mode, unsigned is_prefetch,
+								  unsigned async,
+								  void (*callback_func)(void *), void *callback_arg)
 {
 	unsigned requesting_node = dst_replicate->memory_node;
 
 	if (dst_replicate->state != STARPU_INVALID)
 	{
+#ifdef STARPU_MEMORY_STATUS
+		enum _starpu_cache_state old_state = dst_replicate->state;
+#endif
 		/* the data is already available so we can stop */
 		_starpu_update_data_state(handle, dst_replicate, mode);
 		_starpu_msi_cache_hit(requesting_node);
 
+#ifdef STARPU_MEMORY_STATUS
+		_starpu_handle_stats_cache_hit(handle, requesting_node);
+
+		/* XXX Broken ? */
+		if (old_state == STARPU_SHARED
+		    && dst_replicate->state == STARPU_OWNER)
+			_starpu_handle_stats_shared_to_owner(handle, requesting_node);
+#endif
+
+		_starpu_memchunk_recently_used(dst_replicate->mc, requesting_node);
+
 		_starpu_spin_unlock(&handle->header_lock);
 
 		if (callback_func)
@@ -304,7 +382,7 @@ starpu_data_request_t create_request_to_fetch_data(starpu_data_handle handle,
 	/* if the data is in write only mode, there is no need for a source */
 	if (mode & STARPU_R)
 	{
-		src_node = _starpu_select_src_node(handle);
+		src_node = _starpu_select_src_node(handle, requesting_node);
 		STARPU_ASSERT(src_node != requesting_node);
 	}
 
@@ -313,9 +391,9 @@ starpu_data_request_t create_request_to_fetch_data(starpu_data_handle handle,
 	unsigned src_nodes[4], dst_nodes[4], handling_nodes[4];
 	int nhops = determine_request_path(handle, src_node, requesting_node, mode, 4,
 					src_nodes, dst_nodes, handling_nodes);
-	STARPU_ASSERT(nhops <= 4);
 
-	starpu_data_request_t requests[nhops];
+	STARPU_ASSERT(nhops >= 1 && nhops <= 4);
+	struct _starpu_data_request *requests[nhops];
 
 	/* Did we reuse a request for that hop ? */
 	int reused_requests[nhops];
@@ -324,14 +402,14 @@ starpu_data_request_t create_request_to_fetch_data(starpu_data_handle handle,
 	int hop;
 	for (hop = 0; hop < nhops; hop++)
 	{
-		starpu_data_request_t r;
+		struct _starpu_data_request *r;
 
 		unsigned hop_src_node = src_nodes[hop];
 		unsigned hop_dst_node = dst_nodes[hop];
 		unsigned hop_handling_node = handling_nodes[hop];
 
-		struct starpu_data_replicate_s *hop_src_replicate;
-		struct starpu_data_replicate_s *hop_dst_replicate;
+		struct _starpu_data_replicate *hop_src_replicate;
+		struct _starpu_data_replicate *hop_dst_replicate;
 
 		/* Only the first request is independant */
 		unsigned ndeps = (hop == 0)?0:1;
@@ -341,24 +419,26 @@ starpu_data_request_t create_request_to_fetch_data(starpu_data_handle handle,
 
 		/* Try to reuse a request if possible */
 		r = _starpu_search_existing_data_request(hop_dst_replicate,
-				(mode & STARPU_R)?hop_src_node:hop_dst_node, mode);
+				(mode & STARPU_R)?hop_src_node:hop_dst_node,
+							 mode, is_prefetch);
 
 		reused_requests[hop] = !!r;
 
-		if (!r) {
+		if (!r)
+		{
 			/* Create a new request if there was no request to reuse */
 			r = _starpu_create_data_request(handle, hop_src_replicate,
-					hop_dst_replicate, hop_handling_node,
-					mode, ndeps);
+							hop_dst_replicate, hop_handling_node,
+							mode, ndeps, is_prefetch);
 		}
 
-		requests[hop] = r; 
+		requests[hop] = r;
 	}
 
 	/* Chain these requests */
 	for (hop = 0; hop < nhops; hop++)
 	{
-		starpu_data_request_t r;
+		struct _starpu_data_request *r;
 		r = requests[hop];
 
 		if (hop != nhops - 1)
@@ -374,7 +454,7 @@ starpu_data_request_t create_request_to_fetch_data(starpu_data_handle handle,
 			_starpu_spin_unlock(&r->lock);
 	}
 
-	if (!is_prefetch)
+	if (!async)
 		requests[nhops - 1]->refcnt++;
 
 
@@ -386,9 +466,9 @@ starpu_data_request_t create_request_to_fetch_data(starpu_data_handle handle,
 	return requests[nhops - 1];
 }
 
-int _starpu_fetch_data_on_node(starpu_data_handle handle, struct starpu_data_replicate_s *dst_replicate,
-				starpu_access_mode mode, unsigned is_prefetch,
-				void (*callback_func)(void *), void *callback_arg)
+int _starpu_fetch_data_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *dst_replicate,
+			       enum starpu_access_mode mode, unsigned detached, unsigned async,
+			       void (*callback_func)(void *), void *callback_arg)
 {
 	uint32_t local_node = _starpu_get_local_memory_node();
         _STARPU_LOG_IN();
@@ -396,57 +476,62 @@ int _starpu_fetch_data_on_node(starpu_data_handle handle, struct starpu_data_rep
 	while (_starpu_spin_trylock(&handle->header_lock))
 		_starpu_datawizard_progress(local_node, 1);
 
-	if (!is_prefetch)
+	if (!detached)
+	{
+		/* Take a reference which will be released by _starpu_release_data_on_node */
 		dst_replicate->refcnt++;
+		dst_replicate->handle->busy_count++;
+	}
 
-	starpu_data_request_t r;
-	r = create_request_to_fetch_data(handle, dst_replicate, mode,
-					is_prefetch, callback_func, callback_arg);
+	struct _starpu_data_request *r;
+	r = _starpu_create_request_to_fetch_data(handle, dst_replicate, mode,
+						 detached, async, callback_func, callback_arg);
 
 	/* If no request was created, the handle was already up-to-date on the
-	 * node. In this case, create_request_to_fetch_data has already
+	 * node. In this case, _starpu_create_request_to_fetch_data has already
 	 * unlocked the header. */
 	if (!r)
 		return 0;
-	
+
 	_starpu_spin_unlock(&handle->header_lock);
 
-	int ret = is_prefetch?0:_starpu_wait_data_request_completion(r, 1);
+	int ret = async?0:_starpu_wait_data_request_completion(r, 1);
         _STARPU_LOG_OUT();
         return ret;
 }
 
-static int prefetch_data_on_node(starpu_data_handle handle, struct starpu_data_replicate_s *replicate, starpu_access_mode mode)
+static int prefetch_data_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_access_mode mode)
 {
-	return _starpu_fetch_data_on_node(handle, replicate, mode, 1, NULL, NULL);
+	return _starpu_fetch_data_on_node(handle, replicate, mode, 1, 1, NULL, NULL);
 }
 
-static int fetch_data(starpu_data_handle handle, struct starpu_data_replicate_s *replicate, starpu_access_mode mode)
+static int fetch_data(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_access_mode mode)
 {
-	return _starpu_fetch_data_on_node(handle, replicate, mode, 0, NULL, NULL);
+	return _starpu_fetch_data_on_node(handle, replicate, mode, 0, 0, NULL, NULL);
 }
 
-uint32_t _starpu_get_data_refcnt(starpu_data_handle handle, uint32_t node)
+uint32_t _starpu_get_data_refcnt(starpu_data_handle_t handle, uint32_t node)
 {
 	return handle->per_node[node].refcnt;
 }
 
-size_t _starpu_data_get_size(starpu_data_handle handle)
+size_t _starpu_data_get_size(starpu_data_handle_t handle)
 {
 	return handle->data_size;
 }
 
-uint32_t _starpu_data_get_footprint(starpu_data_handle handle)
+uint32_t _starpu_data_get_footprint(starpu_data_handle_t handle)
 {
 	return handle->footprint;
 }
 
-/* in case the data was accessed on a write mode, do not forget to 
+/* in case the data was accessed on a write mode, do not forget to
  * make it accessible again once it is possible ! */
-void _starpu_release_data_on_node(starpu_data_handle handle, uint32_t default_wt_mask, struct starpu_data_replicate_s *replicate)
+void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_wt_mask, struct _starpu_data_replicate *replicate)
 {
 	uint32_t wt_mask;
 	wt_mask = default_wt_mask | handle->wt_mask;
+	wt_mask &= (1<<starpu_memory_nodes_get_count())-1;
 
 	/* Note that it is possible that there is no valid copy of the data (if
 	 * starpu_data_invalidate was called for instance). In that case, we do
@@ -454,7 +539,7 @@ void _starpu_release_data_on_node(starpu_data_handle handle, uint32_t default_wt
 
 	unsigned memory_node = replicate->memory_node;
 
-	if (replicate->state != STARPU_INVALID)
+	if (replicate->state != STARPU_INVALID && handle->current_mode & STARPU_W)
 	if ((wt_mask & ~(1<<memory_node)))
 		_starpu_write_through_data(handle, memory_node, wt_mask);
 
@@ -462,10 +547,14 @@ void _starpu_release_data_on_node(starpu_data_handle handle, uint32_t default_wt
 	while (_starpu_spin_trylock(&handle->header_lock))
 		_starpu_datawizard_progress(local_node, 1);
 
+	/* Release refcnt taken by fetch_data_on_node */
 	replicate->refcnt--;
-
 	STARPU_ASSERT(replicate->refcnt >= 0);
 
+	STARPU_ASSERT(handle->busy_count > 0);
+	handle->busy_count--;
+	_starpu_data_check_not_busy(handle);
+
 	/* In case there was a temporary handle (eg. used for reduction), this
 	 * handle may have requested to be destroyed when the data is released
 	 * */
@@ -477,35 +566,34 @@ void _starpu_release_data_on_node(starpu_data_handle handle, uint32_t default_wt
 		_starpu_spin_unlock(&handle->header_lock);
 }
 
-static void _starpu_set_data_requested_flag_if_needed(struct starpu_data_replicate_s *replicate)
+static void _starpu_set_data_requested_flag_if_needed(struct _starpu_data_replicate *replicate)
 {
 // XXX : this is just a hint, so we don't take the lock ...
-//	pthread_spin_lock(&handle->header_lock);
+//	_STARPU_PTHREAD_SPIN_LOCK(&handle->header_lock);
 
-	if (replicate->state == STARPU_INVALID) 
+	if (replicate->state == STARPU_INVALID)
 	{
 		unsigned dst_node = replicate->memory_node;
 		replicate->requested[dst_node] = 1;
 	}
 
-//	pthread_spin_unlock(&handle->header_lock);
+//	_STARPU_PTHREAD_SPIN_UNLOCK(&handle->header_lock);
 }
 
 int starpu_prefetch_task_input_on_node(struct starpu_task *task, uint32_t node)
 {
-	starpu_buffer_descr *descrs = task->buffers;
 	unsigned nbuffers = task->cl->nbuffers;
-
 	unsigned index;
+
 	for (index = 0; index < nbuffers; index++)
 	{
-		starpu_data_handle handle = descrs[index].handle;
-		starpu_access_mode mode = descrs[index].mode;
+		starpu_data_handle_t handle = task->handles[index];
+		enum starpu_access_mode mode = task->cl->modes[index];
 
 		if (mode & (STARPU_SCRATCH|STARPU_REDUX))
 			continue;
 
-		struct starpu_data_replicate_s *replicate = &handle->per_node[node];
+		struct _starpu_data_replicate *replicate = &handle->per_node[node];
 		prefetch_data_on_node(handle, replicate, mode);
 
 		_starpu_set_data_requested_flag_if_needed(replicate);
@@ -514,15 +602,25 @@ int starpu_prefetch_task_input_on_node(struct starpu_task *task, uint32_t node)
 	return 0;
 }
 
-int _starpu_fetch_task_input(struct starpu_task *task, uint32_t mask)
+static struct _starpu_data_replicate *get_replicate(starpu_data_handle_t handle, enum starpu_access_mode mode, int workerid, unsigned local_memory_node)
+{
+	if (mode & (STARPU_SCRATCH|STARPU_REDUX))
+		return &handle->per_worker[workerid];
+	else
+		/* That's a "normal" buffer (R/W) */
+		return &handle->per_node[local_memory_node];
+}
+
+int _starpu_fetch_task_input(struct _starpu_job *j, uint32_t mask)
 {
-	STARPU_TRACE_START_FETCH_INPUT(NULL);
+	_STARPU_TRACE_START_FETCH_INPUT(NULL);
 
 	int profiling = starpu_profiling_status_get();
+	struct starpu_task *task = j->task;
 	if (profiling && task->profiling_info)
-		starpu_clock_gettime(&task->profiling_info->acquire_data_start_time);
+		_starpu_clock_gettime(&task->profiling_info->acquire_data_start_time);
 
-	starpu_buffer_descr *descrs = task->buffers;
+	struct starpu_buffer_descr *descrs = j->ordered_buffers;
 	unsigned nbuffers = task->cl->nbuffers;
 
 	unsigned local_memory_node = _starpu_get_local_memory_node();
@@ -533,23 +631,33 @@ int _starpu_fetch_task_input(struct starpu_task *task, uint32_t mask)
 	for (index = 0; index < nbuffers; index++)
 	{
 		int ret;
-		starpu_data_handle handle = descrs[index].handle;
-		starpu_access_mode mode = descrs[index].mode;
+		starpu_data_handle_t handle = descrs[index].handle;
+		enum starpu_access_mode mode = descrs[index].mode;
 
-		struct starpu_data_replicate_s *local_replicate;
+		struct _starpu_data_replicate *local_replicate;
 
-		if (mode & (STARPU_SCRATCH|STARPU_REDUX))
-		{
-			local_replicate = &handle->per_worker[workerid];
-		}
-		else {
-			/* That's a "normal" buffer (R/W) */
-			local_replicate = &handle->per_node[local_memory_node];
-		}
+		if (index && descrs[index-1].handle == descrs[index].handle)
+			/* We have already took this data, skip it. This
+			 * depends on ordering putting writes before reads, see
+			 * _starpu_compar_handles */
+			continue;
+
+		local_replicate = get_replicate(handle, mode, workerid, local_memory_node);
 
 		ret = fetch_data(handle, local_replicate, mode);
 		if (STARPU_UNLIKELY(ret))
 			goto enomem;
+	}
+
+	/* Now that we have taken the data locks in locking order, fill the codelet interfaces in function order.  */
+	for (index = 0; index < nbuffers; index++)
+	{
+		starpu_data_handle_t handle = task->handles[index];
+		enum starpu_access_mode mode = task->cl->modes[index];
+
+		struct _starpu_data_replicate *local_replicate;
+
+		local_replicate = get_replicate(handle, mode, workerid, local_memory_node);
 
 		task->interfaces[index] = local_replicate->data_interface;
 
@@ -562,9 +670,9 @@ int _starpu_fetch_task_input(struct starpu_task *task, uint32_t mask)
 	}
 
 	if (profiling && task->profiling_info)
-		starpu_clock_gettime(&task->profiling_info->acquire_data_end_time);
+		_starpu_clock_gettime(&task->profiling_info->acquire_data_end_time);
 
-	STARPU_TRACE_END_FETCH_INPUT(NULL);
+	_STARPU_TRACE_END_FETCH_INPUT(NULL);
 
 	return 0;
 
@@ -573,39 +681,40 @@ enomem:
 	/* XXX broken ... */
 	_STARPU_DISP("something went wrong with buffer %u\n", index);
 	//push_codelet_output(task, index, mask);
-	_starpu_push_task_output(task, mask);
+	_starpu_push_task_output(j, mask);
 	return -1;
 }
 
-void _starpu_push_task_output(struct starpu_task *task, uint32_t mask)
+void _starpu_push_task_output(struct _starpu_job *j, uint32_t mask)
 {
-	STARPU_TRACE_START_PUSH_OUTPUT(NULL);
+	_STARPU_TRACE_START_PUSH_OUTPUT(NULL);
 
 	int profiling = starpu_profiling_status_get();
+	struct starpu_task *task = j->task;
 	if (profiling && task->profiling_info)
-		starpu_clock_gettime(&task->profiling_info->release_data_start_time);
+		_starpu_clock_gettime(&task->profiling_info->release_data_start_time);
 
-        starpu_buffer_descr *descrs = task->buffers;
+        struct starpu_buffer_descr *descrs = j->ordered_buffers;
         unsigned nbuffers = task->cl->nbuffers;
 
+	int workerid = starpu_worker_get_id();
+	unsigned local_memory_node = _starpu_get_local_memory_node();
+
 	unsigned index;
 	for (index = 0; index < nbuffers; index++)
 	{
-		starpu_data_handle handle = descrs[index].handle;
-		starpu_access_mode mode = descrs[index].mode;
+		starpu_data_handle_t handle = descrs[index].handle;
+		enum starpu_access_mode mode = descrs[index].mode;
 
-		struct starpu_data_replicate_s *replicate;
+		struct _starpu_data_replicate *local_replicate;
 
-		if (mode & STARPU_RW)
-		{
-			unsigned local_node = _starpu_get_local_memory_node();
-			replicate = &handle->per_node[local_node];
-		}
-		else
-		{
-			int workerid = starpu_worker_get_id();
-			replicate = &handle->per_worker[workerid];
-		}
+		if (index && descrs[index-1].handle == descrs[index].handle)
+			/* We have already released this data, skip it. This
+			 * depends on ordering putting writes before reads, see
+			 * _starpu_compar_handles */
+			continue;
+
+		local_replicate = get_replicate(handle, mode, workerid, local_memory_node);
 
 		/* In case there was a temporary handle (eg. used for
 		 * reduction), this handle may have requested to be destroyed
@@ -613,33 +722,34 @@ void _starpu_push_task_output(struct starpu_task *task, uint32_t mask)
 		 * */
 		unsigned handle_was_destroyed = handle->lazy_unregister;
 
-		_starpu_release_data_on_node(handle, mask, replicate);
+		_starpu_release_data_on_node(handle, mask, local_replicate);
 		if (!handle_was_destroyed)
 			_starpu_release_data_enforce_sequential_consistency(task, handle);
 	}
 
 	if (profiling && task->profiling_info)
-		starpu_clock_gettime(&task->profiling_info->release_data_end_time);
+		_starpu_clock_gettime(&task->profiling_info->release_data_end_time);
 
-	STARPU_TRACE_END_PUSH_OUTPUT(NULL);
+	_STARPU_TRACE_END_PUSH_OUTPUT(NULL);
 }
 
 /* NB : this value can only be an indication of the status of a data
 	at some point, but there is no strong garantee ! */
-unsigned _starpu_is_data_present_or_requested(starpu_data_handle handle, uint32_t node)
+unsigned _starpu_is_data_present_or_requested(starpu_data_handle_t handle, uint32_t node)
 {
 	unsigned ret = 0;
 
 // XXX : this is just a hint, so we don't take the lock ...
-//	pthread_spin_lock(&handle->header_lock);
+//	_STARPU_PTHREAD_SPIN_LOCK(&handle->header_lock);
 
 	if (handle->per_node[node].state != STARPU_INVALID)
 	{
 		ret  = 1;
 	}
-	else {
+	else
+	{
 		unsigned i;
-		unsigned nnodes = _starpu_get_memory_nodes_count();
+		unsigned nnodes = starpu_memory_nodes_get_count();
 
 		for (i = 0; i < nnodes; i++)
 		{
@@ -649,7 +759,7 @@ unsigned _starpu_is_data_present_or_requested(starpu_data_handle handle, uint32_
 
 	}
 
-//	pthread_spin_unlock(&handle->header_lock);
+//	_STARPU_PTHREAD_SPIN_UNLOCK(&handle->header_lock);
 
 	return ret;
 }
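The new _starpu_select_src_node() above picks, among the nodes holding a valid replicate, the one with the lowest predicted transfer time towards the destination, and falls back to the older heuristic (prefer main memory over GPU memory) whenever an estimate is missing. Below is a minimal, self-contained sketch of that policy; struct node, the predicted_time field and the node kinds are illustrative stand-ins, not StarPU API.

/* Standalone sketch of the source-node selection policy added above:
 * among nodes holding a valid copy, pick the one with the lowest
 * predicted transfer time; if any estimate is missing, fall back to
 * preferring main memory over GPU memory. */
#include <math.h>
#include <stdio.h>

enum node_kind { CPU_RAM, CUDA_RAM, OPENCL_RAM };

struct node
{
	enum node_kind kind;
	int has_valid_copy;	/* replicate state != INVALID */
	double predicted_time;	/* predicted transfer cost to dst, 0.0 = unknown */
};

static int select_src_node(const struct node *nodes, int nnodes)
{
	int best = -1, i;
	double best_cost = INFINITY;

	/* First pass: use the transfer-time estimates when they all exist. */
	for (i = 0; i < nnodes; i++)
	{
		if (!nodes[i].has_valid_copy)
			continue;
		if (nodes[i].predicted_time == 0.0)
		{
			best = -1;	/* no estimate: revert to the dumb strategy */
			break;
		}
		if (nodes[i].predicted_time < best_cost)
		{
			best_cost = nodes[i].predicted_time;
			best = i;
		}
	}
	if (best != -1)
		return best;

	/* Dumb strategy: take any valid copy, but prefer plain RAM over GPUs. */
	for (i = 0; i < nnodes; i++)
	{
		if (!nodes[i].has_valid_copy)
			continue;
		best = i;
		if (nodes[i].kind == CPU_RAM)
			break;
	}
	return best;
}

int main(void)
{
	struct node nodes[3] =
	{
		{ CPU_RAM,    1, 4.0 },
		{ CUDA_RAM,   1, 1.5 },
		{ OPENCL_RAM, 0, 0.0 },
	};
	printf("selected source node: %d\n", select_src_node(nodes, 3));
	return 0;
}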

src/datawizard/coherency.h (+106 -64)

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -31,15 +31,16 @@
 #include <datawizard/interfaces/data_interface.h>
 #include <datawizard/datastats.h>
 
-typedef enum {
+enum _starpu_cache_state
+{
 	STARPU_OWNER,
 	STARPU_SHARED,
 	STARPU_INVALID
-} starpu_cache_state;
+};
 
 /* this should contain the information relative to a given data replicate  */
-LIST_TYPE(starpu_data_replicate,
-	starpu_data_handle handle;
+LIST_TYPE(_starpu_data_replicate,
+	starpu_data_handle_t handle;
 
 	/* describe the actual data layout */
 	void *data_interface;
@@ -55,65 +56,83 @@ LIST_TYPE(starpu_data_replicate,
 	unsigned initialized;
 
 	/* describes the state of the local data in term of coherency */
-	starpu_cache_state	state; 
+	enum _starpu_cache_state	state;
 
 	int refcnt;
 
 	/* is the data locally allocated ? */
-	uint8_t allocated; 
-	/* was it automatically allocated ? */
-	/* perhaps the allocation was perform higher in the hiearchy 
+	uint8_t allocated;
+	/* was it automatically allocated ? (else it's the application-provided
+	 * buffer, don't ever try to free it!) */
+	/* perhaps the allocation was perform higher in the hiearchy
 	 * for now this is just translated into !automatically_allocated
 	 * */
 	uint8_t automatically_allocated;
 
+        /* Pointer to memchunk for LRU strategy */
+	struct _starpu_mem_chunk * mc;
+
 	/* To help the scheduling policies to make some decision, we
-	   may keep a track of the tasks that are likely to request 
+	   may keep a track of the tasks that are likely to request
 	   this data on the current node.
 	   It is the responsability of the scheduling _policy_ to set that
 	   flag when it assigns a task to a queue, policies which do not
 	   use this hint can simply ignore it.
 	 */
 	uint8_t requested[STARPU_MAXNODES];
-	struct starpu_data_request_s *request[STARPU_MAXNODES];
-);
+	struct _starpu_data_request *request[STARPU_MAXNODES];
+)
 
-struct starpu_data_requester_list_s;
+struct _starpu_data_requester_list;
 
-struct starpu_jobid_list {
+struct _starpu_jobid_list
+{
 	unsigned long id;
-	struct starpu_jobid_list *next;
+	struct _starpu_jobid_list *next;
 };
 
 /* This structure describes a simply-linked list of task */
-struct starpu_task_wrapper_list {
+struct _starpu_task_wrapper_list
+{
 	struct starpu_task *task;
-	struct starpu_task_wrapper_list *next;
+	struct _starpu_task_wrapper_list *next;
 };
 
-struct starpu_data_state_t {
-	struct starpu_data_requester_list_s *req_list;
-	/* the number of requests currently in the scheduling engine
-	 * (not in the req_list anymore) */
+struct _starpu_data_state
+{
+	struct _starpu_data_requester_list *req_list;
+	/* the number of requests currently in the scheduling engine (not in
+	 * the req_list anymore), i.e. the number of holders of the
+	 * current_mode rwlock */
 	unsigned refcnt;
-	starpu_access_mode current_mode;
+	enum starpu_access_mode current_mode;
 	/* protect meta data */
-	starpu_spinlock_t header_lock;
+	struct _starpu_spinlock header_lock;
+
+	/* Condition to make application wait for all transfers before freeing handle */
+	/* busy_count is the number of handle->refcnt, handle->per_node[*]->refcnt, and number of starpu_data_requesters */
+	/* Core code which releases busy_count has to call
+	 * _starpu_data_check_not_busy to let starpu_data_unregister proceed */
+	unsigned busy_count;
+	/* Is starpu_data_unregister waiting for busy_count? */
+	unsigned busy_waiting;
+	pthread_mutex_t busy_mutex;
+	pthread_cond_t busy_cond;
 
 	/* In case we user filters, the handle may describe a sub-data */
-	struct starpu_data_state_t *root_handle; /* root of the tree */
-	struct starpu_data_state_t *father_handle; /* father of the node, NULL if the current node is the root */
+	struct _starpu_data_state *root_handle; /* root of the tree */
+	struct _starpu_data_state *father_handle; /* father of the node, NULL if the current node is the root */
 	unsigned sibling_index; /* indicate which child this node is from the father's perpsective (if any) */
 	unsigned depth; /* what's the depth of the tree ? */
 
-	struct starpu_data_state_t *children;
+	struct _starpu_data_state *children;
 	unsigned nchildren;
 
 	/* describe the state of the data in term of coherency */
-	struct starpu_data_replicate_s per_node[STARPU_MAXNODES];
-	struct starpu_data_replicate_s per_worker[STARPU_NMAXWORKERS];
+	struct _starpu_data_replicate per_node[STARPU_MAXNODES];
+	struct _starpu_data_replicate per_worker[STARPU_NMAXWORKERS];
 
-	struct starpu_data_interface_ops_t *ops;
+	struct starpu_data_interface_ops *ops;
 
 	/* To avoid recomputing data size all the time, we store it directly. */
 	size_t data_size;
@@ -140,14 +159,14 @@ struct starpu_data_state_t {
 	/* This lock should protect any operation to enforce
 	 * sequential_consistency */
 	pthread_mutex_t sequential_consistency_mutex;
-	
+
 	/* The last submitted task (or application data request) that declared
 	 * it would modify the piece of data ? Any task accessing the data in a
 	 * read-only mode should depend on that task implicitely if the
 	 * sequential_consistency flag is enabled. */
-	starpu_access_mode last_submitted_mode;
+	enum starpu_access_mode last_submitted_mode;
 	struct starpu_task *last_submitted_writer;
-	struct starpu_task_wrapper_list *last_submitted_readers;
+	struct _starpu_task_wrapper_list *last_submitted_readers;
 
 	/* If FxT is enabled, we keep track of "ghost dependencies": that is to
 	 * say the dependencies that are not needed anymore, but that should
@@ -157,9 +176,9 @@ struct starpu_data_state_t {
 	 * enforce this dependency anymore.*/
 	unsigned last_submitted_ghost_writer_id_is_valid;
 	unsigned long last_submitted_ghost_writer_id;
-	struct starpu_jobid_list *last_submitted_ghost_readers_id;
-	
-	struct starpu_task_wrapper_list *post_sync_tasks;
+	struct _starpu_jobid_list *last_submitted_ghost_readers_id;
+
+	struct _starpu_task_wrapper_list *post_sync_tasks;
 	unsigned post_sync_tasks_cnt;
 
 	/*
@@ -170,8 +189,8 @@ struct starpu_data_state_t {
 	 * the reduction of an interface into another one (eg. "+="), and init_func
 	 * initializes the data interface to a default value that is stable by
 	 * reduction (eg. 0 for +=). */
-	struct starpu_codelet_t *redux_cl;
-	struct starpu_codelet_t *init_cl;
+	struct starpu_codelet *redux_cl;
+	struct starpu_codelet *init_cl;
 
 	/* Are we currently performing a reduction on that handle ? If so the
 	 * reduction_refcnt should be non null until there are pending tasks
@@ -181,54 +200,77 @@ struct starpu_data_state_t {
 	/* List of requesters that are specific to the pending reduction. This
 	 * list is used when the requests in the req_list list are frozen until
 	 * the end of the reduction. */
-	struct starpu_data_requester_list_s *reduction_req_list;
+	struct _starpu_data_requester_list *reduction_req_list;
 
-	starpu_data_handle reduction_tmp_handles[STARPU_NMAXWORKERS];
+	starpu_data_handle_t reduction_tmp_handles[STARPU_NMAXWORKERS];
 
 	unsigned lazy_unregister;
 
         /* Used for MPI */
         int rank;
 	int tag;
+
+#ifdef STARPU_MEMORY_STATUS
+	/* Handle access stats per node */
+	unsigned stats_direct_access[STARPU_MAXNODES];
+	unsigned stats_loaded_shared[STARPU_MAXNODES];
+	unsigned stats_loaded_owner[STARPU_MAXNODES];
+	unsigned stats_shared_to_owner[STARPU_MAXNODES];
+	unsigned stats_invalidated[STARPU_MAXNODES];
+#endif
+
+	unsigned int mf_node; //XXX
 };
 
 void _starpu_display_msi_stats(void);
 
-int _starpu_fetch_data_on_node(struct starpu_data_state_t *state, struct starpu_data_replicate_s *replicate,
-				starpu_access_mode mode, unsigned is_prefetch,
-				void (*callback_func)(void *), void *callback_arg);
-void _starpu_release_data_on_node(struct starpu_data_state_t *state, uint32_t default_wt_mask,
-				struct starpu_data_replicate_s *replicate);
+/* This does not take a reference on the handle, the caller has to do it,
+ * e.g. through _starpu_attempt_to_submit_data_request_from_apps()
+ * detached means that the core is allowed to drop the request. The caller
+ * should thus *not* take a reference since it can not know whether the request will complete
+ * async means that _starpu_fetch_data_on_node will wait for completion of the request
+ */
+int _starpu_fetch_data_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate,
+			       enum starpu_access_mode mode, unsigned detached, unsigned async,
+			       void (*callback_func)(void *), void *callback_arg);
+/* This releases a reference on the handle */
+void _starpu_release_data_on_node(struct _starpu_data_state *state, uint32_t default_wt_mask,
+				  struct _starpu_data_replicate *replicate);
 
-void _starpu_update_data_state(starpu_data_handle handle,
-				struct starpu_data_replicate_s *requesting_replicate,
-				starpu_access_mode mode);
+void _starpu_update_data_state(starpu_data_handle_t handle,
+			       struct _starpu_data_replicate *requesting_replicate,
+			       enum starpu_access_mode mode);
 
-uint32_t _starpu_get_data_refcnt(struct starpu_data_state_t *state, uint32_t node);
+uint32_t _starpu_get_data_refcnt(struct _starpu_data_state *state, uint32_t node);
 
-size_t _starpu_data_get_size(starpu_data_handle handle);
+size_t _starpu_data_get_size(starpu_data_handle_t handle);
 
-uint32_t _starpu_data_get_footprint(starpu_data_handle handle);
+uint32_t _starpu_data_get_footprint(starpu_data_handle_t handle);
 
-void _starpu_push_task_output(struct starpu_task *task, uint32_t mask);
+void _starpu_push_task_output(struct _starpu_job *j, uint32_t mask);
 
 __attribute__((warn_unused_result))
-int _starpu_fetch_task_input(struct starpu_task *task, uint32_t mask);
+int _starpu_fetch_task_input(struct _starpu_job *j, uint32_t mask);
 
-unsigned _starpu_is_data_present_or_requested(struct starpu_data_state_t *state, uint32_t node);
-unsigned starpu_data_test_if_allocated_on_node(starpu_data_handle handle, uint32_t memory_node);
+unsigned _starpu_is_data_present_or_requested(struct _starpu_data_state *state, uint32_t node);
+unsigned starpu_data_test_if_allocated_on_node(starpu_data_handle_t handle, uint32_t memory_node);
 
 
-uint32_t _starpu_select_src_node(struct starpu_data_state_t *state);
+uint32_t _starpu_select_src_node(struct _starpu_data_state *state, unsigned destination);
 
-starpu_data_request_t create_request_to_fetch_data(starpu_data_handle handle,
-				struct starpu_data_replicate_s *dst_replicate,
-                                starpu_access_mode mode, unsigned is_prefetch,
-                                void (*callback_func)(void *), void *callback_arg);
-
-void _starpu_redux_init_data_replicate(starpu_data_handle handle, struct starpu_data_replicate_s *replicate, int workerid);
-void starpu_data_start_reduction_mode(starpu_data_handle handle);
-void starpu_data_end_reduction_mode(starpu_data_handle handle);
-void starpu_data_end_reduction_mode_terminate(starpu_data_handle handle);
+/* is_prefetch is whether the DSM may drop the request (when there is not enough memory for instance
+ * async is whether the caller wants a reference on the last request, to be
+ * able to wait for it (which will release that reference).
+ */
+struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
+								  struct _starpu_data_replicate *dst_replicate,
+								  enum starpu_access_mode mode, unsigned is_prefetch,
+								  unsigned async,
+								  void (*callback_func)(void *), void *callback_arg);
+
+void _starpu_redux_init_data_replicate(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, int workerid);
+void _starpu_data_start_reduction_mode(starpu_data_handle_t handle);
+void _starpu_data_end_reduction_mode(starpu_data_handle_t handle);
+void _starpu_data_end_reduction_mode_terminate(starpu_data_handle_t handle);
 
 #endif // __COHERENCY__H__
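The busy_count, busy_waiting, busy_mutex and busy_cond fields added to _starpu_data_state let starpu_data_unregister wait until every request and replicate holding a reference has been released. Here is a minimal sketch of that protocol, assuming hypothetical release_ref() and wait_not_busy() helpers in place of the real _starpu_data_check_not_busy path.

/* Sketch of the busy_count protocol described above: the unregister path
 * blocks until all outstanding references are dropped, while reference
 * holders signal when they release one.  Names are illustrative only. */
#include <pthread.h>
#include <stdio.h>

struct handle
{
	unsigned busy_count;	/* outstanding refs (requests, replicates, ...) */
	unsigned busy_waiting;	/* is someone blocked in wait_not_busy()? */
	pthread_mutex_t busy_mutex;
	pthread_cond_t busy_cond;
};

/* Called by whoever drops a reference. */
static void release_ref(struct handle *h)
{
	pthread_mutex_lock(&h->busy_mutex);
	if (--h->busy_count == 0 && h->busy_waiting)
		pthread_cond_broadcast(&h->busy_cond);
	pthread_mutex_unlock(&h->busy_mutex);
}

/* Called by the unregister path before freeing the handle. */
static void wait_not_busy(struct handle *h)
{
	pthread_mutex_lock(&h->busy_mutex);
	h->busy_waiting = 1;
	while (h->busy_count > 0)
		pthread_cond_wait(&h->busy_cond, &h->busy_mutex);
	pthread_mutex_unlock(&h->busy_mutex);
}

int main(void)
{
	struct handle h = { .busy_count = 1, .busy_waiting = 0,
			    .busy_mutex = PTHREAD_MUTEX_INITIALIZER,
			    .busy_cond = PTHREAD_COND_INITIALIZER };
	release_ref(&h);	/* last reference dropped... */
	wait_not_busy(&h);	/* ...so unregister can proceed immediately */
	printf("handle no longer busy\n");
	return 0;
}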

src/datawizard/copy_driver.c (+123 -97)

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -32,23 +32,23 @@ void _starpu_wake_all_blocked_workers_on_node(unsigned nodeid)
 	/* wake up all workers on that memory node */
 	unsigned cond_id;
 
-	starpu_mem_node_descr * const descr = _starpu_get_memory_node_description();
+	struct _starpu_mem_node_descr * const descr = _starpu_get_memory_node_description();
 
-	PTHREAD_RWLOCK_RDLOCK(&descr->conditions_rwlock);
+	_STARPU_PTHREAD_RWLOCK_RDLOCK(&descr->conditions_rwlock);
 
 	unsigned nconds = descr->condition_count[nodeid];
 	for (cond_id = 0; cond_id < nconds; cond_id++)
 	{
-		struct _cond_and_mutex *condition;
+		struct _starpu_cond_and_mutex *condition;
 		condition  = &descr->conditions_attached_to_node[nodeid][cond_id];
 
 		/* wake anybody waiting on that condition */
-		PTHREAD_MUTEX_LOCK(condition->mutex);
-		PTHREAD_COND_BROADCAST(condition->cond);
-		PTHREAD_MUTEX_UNLOCK(condition->mutex);
+		_STARPU_PTHREAD_MUTEX_LOCK(condition->mutex);
+		_STARPU_PTHREAD_COND_BROADCAST(condition->cond);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(condition->mutex);
 	}
 
-	PTHREAD_RWLOCK_UNLOCK(&descr->conditions_rwlock);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&descr->conditions_rwlock);
 }
 
 void starpu_wake_all_blocked_workers(void)
@@ -56,23 +56,23 @@ void starpu_wake_all_blocked_workers(void)
 	/* workers may be blocked on the various queues' conditions */
 	unsigned cond_id;
 
-	starpu_mem_node_descr * const descr = _starpu_get_memory_node_description();
+	struct _starpu_mem_node_descr * const descr = _starpu_get_memory_node_description();
 
-	PTHREAD_RWLOCK_RDLOCK(&descr->conditions_rwlock);
+	_STARPU_PTHREAD_RWLOCK_RDLOCK(&descr->conditions_rwlock);
 
 	unsigned nconds = descr->total_condition_count;
 	for (cond_id = 0; cond_id < nconds; cond_id++)
 	{
-		struct _cond_and_mutex *condition;
+		struct _starpu_cond_and_mutex *condition;
 		condition  = &descr->conditions_all[cond_id];
 
 		/* wake anybody waiting on that condition */
-		PTHREAD_MUTEX_LOCK(condition->mutex);
-		PTHREAD_COND_BROADCAST(condition->cond);
-		PTHREAD_MUTEX_UNLOCK(condition->mutex);
+		_STARPU_PTHREAD_MUTEX_LOCK(condition->mutex);
+		_STARPU_PTHREAD_COND_BROADCAST(condition->cond);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(condition->mutex);
 	}
 
-	PTHREAD_RWLOCK_UNLOCK(&descr->conditions_rwlock);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&descr->conditions_rwlock);
 }
 
 #ifdef STARPU_USE_FXT
@@ -82,7 +82,10 @@ void starpu_wake_all_blocked_workers(void)
 static unsigned communication_cnt = 0;
 #endif
 
-static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_data_replicate_s *src_replicate, struct starpu_data_replicate_s *dst_replicate, struct starpu_data_request_s *req STARPU_ATTRIBUTE_UNUSED)
+static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
+				    struct _starpu_data_replicate *src_replicate,
+				    struct _starpu_data_replicate *dst_replicate,
+				    struct _starpu_data_request *req STARPU_ATTRIBUTE_UNUSED)
 {
 	int ret = 0;
 
@@ -91,8 +94,8 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 	unsigned src_node = src_replicate->memory_node;
 	unsigned dst_node = dst_replicate->memory_node;
 
-	starpu_node_kind src_kind = _starpu_get_node_kind(src_node);
-	starpu_node_kind dst_kind = _starpu_get_node_kind(dst_node);
+	enum starpu_node_kind src_kind = starpu_node_get_kind(src_node);
+	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);
 
 	STARPU_ASSERT(src_replicate->refcnt);
 	STARPU_ASSERT(dst_replicate->refcnt);
@@ -105,6 +108,8 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 	cudaStream_t stream;
 #endif
 
+	_starpu_comm_amounts_inc(src_node, dst_node, handle->ops->get_size(handle));
+
 	void *src_interface = src_replicate->data_interface;
 	void *dst_interface = dst_replicate->data_interface;
 
@@ -112,12 +117,13 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 	if ((src_kind == STARPU_CUDA_RAM) || (dst_kind == STARPU_CUDA_RAM))
 	{
 		int node = (dst_kind == STARPU_CUDA_RAM)?dst_node:src_node;
-		cures = cudaSetDevice(starpu_memory_node_to_devid(node));
+		cures = cudaSetDevice(_starpu_memory_node_to_devid(node));
 		STARPU_ASSERT(cures == cudaSuccess);
 	}
 #endif
 
-	switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind)) {
+	switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind))
+	{
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
 		/* STARPU_CPU_RAM -> STARPU_CPU_RAM */
 		STARPU_ASSERT(copy_methods->ram_to_ram);
@@ -126,12 +132,17 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 #ifdef STARPU_USE_CUDA
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
 		/* only the proper CUBLAS thread can initiate this directly ! */
+#if !defined(HAVE_CUDA_MEMCPY_PEER)
+		STARPU_ASSERT(_starpu_get_local_memory_node() == src_node);
+#endif
 		STARPU_ASSERT(copy_methods->cuda_to_ram);
-		if (!req || !copy_methods->cuda_to_ram_async) {
+		if (!req || !copy_methods->cuda_to_ram_async)
+		{
 			/* this is not associated to a request so it's synchronous */
 			copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
 		}
-		else {
+		else
+		{
 			req->async_channel.type = STARPU_CUDA_RAM;
 			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
@@ -146,19 +157,23 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
 		/* STARPU_CPU_RAM -> CUBLAS_RAM */
 		/* only the proper CUBLAS thread can initiate this ! */
+#if !defined(HAVE_CUDA_MEMCPY_PEER)
 		STARPU_ASSERT(_starpu_get_local_memory_node() == dst_node);
+#endif
 		STARPU_ASSERT(copy_methods->ram_to_cuda);
-		if (!req || !copy_methods->ram_to_cuda_async) {
+		if (!req || !copy_methods->ram_to_cuda_async)
+		{
 			/* this is not associated to a request so it's synchronous */
 			copy_methods->ram_to_cuda(src_interface, src_node, dst_interface, dst_node);
 		}
-		else {
+		else
+		{
 			req->async_channel.type = STARPU_CUDA_RAM;
 			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
 			if (STARPU_UNLIKELY(cures != cudaSuccess))
 				STARPU_CUDA_REPORT_ERROR(cures);
 
-			stream = starpu_cuda_get_local_stream();
+			stream = starpu_cuda_get_local_transfer_stream();
 			ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
 
 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
@@ -169,16 +184,19 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
 		/* CUDA - CUDA transfer */
 		STARPU_ASSERT(copy_methods->cuda_to_cuda || copy_methods->cuda_to_cuda_async);
-		if (!req || !copy_methods->cuda_to_cuda_async) {
+		if (!req || !copy_methods->cuda_to_cuda_async)
+		{
+			STARPU_ASSERT(copy_methods->cuda_to_cuda);
 			/* this is not associated to a request so it's synchronous */
 			copy_methods->cuda_to_cuda(src_interface, src_node, dst_interface, dst_node);
 		}
-		else {
+		else
+		{
 			req->async_channel.type = STARPU_CUDA_RAM;
 			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 
-			stream = starpu_cuda_get_local_stream();
+			stream = starpu_cuda_get_local_transfer_stream();
 			ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
 
 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
@@ -189,18 +207,22 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 #ifdef STARPU_USE_OPENCL
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
 		/* OpenCL -> RAM */
-		if (_starpu_get_local_memory_node() == src_node) {
+		if (_starpu_get_local_memory_node() == src_node)
+		{
 			STARPU_ASSERT(copy_methods->opencl_to_ram);
-			if (!req || !copy_methods->opencl_to_ram_async) {
+			if (!req || !copy_methods->opencl_to_ram_async)
+			{
 				/* this is not associated to a request so it's synchronous */
 				copy_methods->opencl_to_ram(src_interface, src_node, dst_interface, dst_node);
 			}
-			else {
+			else
+			{
 				req->async_channel.type = STARPU_OPENCL_RAM;
 				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
 			}
 		}
-		else {
+		else
+		{
 			/* we should not have a blocking call ! */
 			STARPU_ABORT();
 		}
@@ -209,11 +231,13 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 		/* STARPU_CPU_RAM -> STARPU_OPENCL_RAM */
 		STARPU_ASSERT(_starpu_get_local_memory_node() == dst_node);
 		STARPU_ASSERT(copy_methods->ram_to_opencl);
-		if (!req || !copy_methods->ram_to_opencl_async) {
+		if (!req || !copy_methods->ram_to_opencl_async)
+		{
 			/* this is not associated to a request so it's synchronous */
 			copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
 		}
-		else {
+		else
+		{
 			req->async_channel.type = STARPU_OPENCL_RAM;
 			ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
 		}
@@ -227,12 +251,12 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 	return ret;
 }
 
-int __attribute__((warn_unused_result)) _starpu_driver_copy_data_1_to_1(starpu_data_handle handle,
-						struct starpu_data_replicate_s *src_replicate,
-						struct starpu_data_replicate_s *dst_replicate,
-						unsigned donotread,
-						struct starpu_data_request_s *req,
-						unsigned may_alloc)
+int __attribute__((warn_unused_result)) _starpu_driver_copy_data_1_to_1(starpu_data_handle_t handle,
+									struct _starpu_data_replicate *src_replicate,
+									struct _starpu_data_replicate *dst_replicate,
+									unsigned donotread,
+									struct _starpu_data_request *req,
+									unsigned may_alloc)
 {
 	if (!donotread)
 	{
@@ -252,7 +276,7 @@ int __attribute__((warn_unused_result)) _starpu_driver_copy_data_1_to_1(starpu_d
 		if (!may_alloc)
 			return -ENOMEM;
 
-		ret_alloc = _starpu_allocate_memory_on_node(handle, dst_replicate);
+		ret_alloc = _starpu_allocate_memory_on_node(handle, dst_replicate,req->prefetch);
 		if (ret_alloc)
 			return -ENOMEM;
 	}
@@ -260,12 +284,13 @@ int __attribute__((warn_unused_result)) _starpu_driver_copy_data_1_to_1(starpu_d
 	STARPU_ASSERT(dst_replicate->allocated);
 	STARPU_ASSERT(dst_replicate->refcnt);
 
-	/* if there is no need to actually read the data, 
+	/* if there is no need to actually read the data,
 	 * we do not perform any transfer */
-	if (!donotread) {
+	if (!donotread)
+	{
 		size_t size = _starpu_data_get_size(handle);
 		_starpu_bus_update_profiling_info((int)src_node, (int)dst_node, size);
-		
+
 #ifdef STARPU_USE_FXT
 		com_id = STARPU_ATOMIC_ADD(&communication_cnt, 1);
 
@@ -273,13 +298,13 @@ int __attribute__((warn_unused_result)) _starpu_driver_copy_data_1_to_1(starpu_d
 			req->com_id = com_id;
 #endif
 
-		STARPU_TRACE_START_DRIVER_COPY(src_node, dst_node, size, com_id);
+		_STARPU_TRACE_START_DRIVER_COPY(src_node, dst_node, size, com_id);
 		ret_copy = copy_data_1_to_1_generic(handle, src_replicate, dst_replicate, req);
 
 #ifdef STARPU_USE_FXT
 		if (ret_copy != -EAGAIN)
 		{
-			STARPU_TRACE_END_DRIVER_COPY(src_node, dst_node, size, com_id);
+			_STARPU_TRACE_END_DRIVER_COPY(src_node, dst_node, size, com_id);
 		}
 #endif
 
@@ -289,83 +314,84 @@ int __attribute__((warn_unused_result)) _starpu_driver_copy_data_1_to_1(starpu_d
 	return 0;
 }
 
-void _starpu_driver_wait_request_completion(struct starpu_async_channel *async_channel)
+void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel)
 {
-	starpu_node_kind kind = async_channel->type;
+	enum starpu_node_kind kind = async_channel->type;
 #ifdef STARPU_USE_CUDA
 	cudaEvent_t event;
 	cudaError_t cures;
 #endif
 
-	switch (kind) {
+	switch (kind)
+	{
 #ifdef STARPU_USE_CUDA
-		case STARPU_CUDA_RAM:
-			event = (*async_channel).event.cuda_event;
+	case STARPU_CUDA_RAM:
+		event = (*async_channel).event.cuda_event;
 
-			cures = cudaEventSynchronize(event);
-			if (STARPU_UNLIKELY(cures))
-				STARPU_CUDA_REPORT_ERROR(cures);
+		cures = cudaEventSynchronize(event);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
 
-			cures = cudaEventDestroy(event);
-			if (STARPU_UNLIKELY(cures))
-				STARPU_CUDA_REPORT_ERROR(cures);
+		cures = cudaEventDestroy(event);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
 
-			break;
+		break;
 #endif
 #ifdef STARPU_USE_OPENCL
-      case STARPU_OPENCL_RAM:
-         {
-                 if ((*async_channel).event.opencl_event == NULL) STARPU_ABORT();
-                 cl_int err = clWaitForEvents(1, &((*async_channel).event.opencl_event));
-                 if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-                 clReleaseEvent((*async_channel).event.opencl_event);
-         }
-         break;
+	case STARPU_OPENCL_RAM:
+	{
+		if ((*async_channel).event.opencl_event == NULL) STARPU_ABORT();
+		cl_int err = clWaitForEvents(1, &((*async_channel).event.opencl_event));
+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+		clReleaseEvent((*async_channel).event.opencl_event);
+		break;
+	}
 #endif
-		case STARPU_CPU_RAM:
-		default:
-			STARPU_ABORT();
+	case STARPU_CPU_RAM:
+	default:
+		STARPU_ABORT();
 	}
 }
 
-unsigned _starpu_driver_test_request_completion(struct starpu_async_channel *async_channel)
+unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *async_channel)
 {
-	starpu_node_kind kind = async_channel->type;
+	enum starpu_node_kind kind = async_channel->type;
 	unsigned success;
 #ifdef STARPU_USE_CUDA
 	cudaEvent_t event;
 #endif
 
-	switch (kind) {
+	switch (kind)
+	{
 #ifdef STARPU_USE_CUDA
-		case STARPU_CUDA_RAM:
-			event = (*async_channel).event.cuda_event;
-			CUresult cures = cudaEventQuery(event);
-
-			success = (cures == cudaSuccess);
-			if (success)
-				cudaEventDestroy(event);
-			else if (cures != cudaErrorNotReady)
-				STARPU_CUDA_REPORT_ERROR(cures);
-
-			break;
+	case STARPU_CUDA_RAM:
+		event = (*async_channel).event.cuda_event;
+		cudaError_t cures = cudaEventQuery(event);
+
+		success = (cures == cudaSuccess);
+		if (success)
+			cudaEventDestroy(event);
+		else if (cures != cudaErrorNotReady)
+			STARPU_CUDA_REPORT_ERROR(cures);
+		break;
 #endif
 #ifdef STARPU_USE_OPENCL
-      case STARPU_OPENCL_RAM:
-         {
-            cl_int event_status;
-            cl_event opencl_event = (*async_channel).event.opencl_event;
-            if (opencl_event == NULL) STARPU_ABORT();
-            cl_int err = clGetEventInfo(opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
-            if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-            success = (event_status == CL_COMPLETE);
-            break;
-         }
+	case STARPU_OPENCL_RAM:
+	{
+		cl_int event_status;
+		cl_event opencl_event = (*async_channel).event.opencl_event;
+		if (opencl_event == NULL) STARPU_ABORT();
+		cl_int err = clGetEventInfo(opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+		success = (event_status == CL_COMPLETE);
+		break;
+	}
 #endif
-		case STARPU_CPU_RAM:
-		default:
-			STARPU_ABORT();
-			success = 0;
+	case STARPU_CPU_RAM:
+	default:
+		STARPU_ABORT();
+		success = 0;
 	}
 
 	return success;
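
The two functions rewritten above follow the same tagged-union pattern: an asynchronous channel carries an enum starpu_node_kind discriminant plus a backend-specific event, and completion is either awaited (cudaEventSynchronize / clWaitForEvents) or polled (cudaEventQuery / clGetEventInfo). The standalone sketch below restates the polling side of that pattern with a fake event type standing in for cudaEvent_t and cl_event; it is an illustration only, not StarPU code.

#include <stdio.h>

/* Stand-in for cudaEvent_t / cl_event: a real driver queries the backend
 * (cudaEventQuery(), clGetEventInfo()) instead of reading a flag. */
struct fake_event { int done; };

enum channel_kind { KIND_CUDA_RAM, KIND_OPENCL_RAM };

union channel_event
{
	struct fake_event *cuda_event;
	struct fake_event *opencl_event;
};

struct async_channel
{
	enum channel_kind type;
	union channel_event event;
};

/* Non-blocking completion test: returns 1 once the transfer has landed. */
static unsigned channel_test_completion(struct async_channel *c)
{
	switch (c->type)
	{
	case KIND_CUDA_RAM:
		return c->event.cuda_event->done;
	case KIND_OPENCL_RAM:
		return c->event.opencl_event->done;
	default:
		return 0;
	}
}

int main(void)
{
	struct fake_event ev = { 0 };
	struct async_channel chan = { .type = KIND_CUDA_RAM, .event.cuda_event = &ev };

	printf("completed? %u\n", channel_test_completion(&chan));
	ev.done = 1;	/* the backend would flip this when the copy finishes */
	printf("completed? %u\n", channel_test_completion(&chan));
	return 0;
}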

+ 18 - 16
src/datawizard/copy_driver.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -33,12 +33,13 @@
 #include <starpu_opencl.h>
 #endif
 
-struct starpu_data_request_s;
-struct starpu_data_replicate_s;
+struct _starpu_data_request;
+struct _starpu_data_replicate;
 
 /* this is a structure that can be queried to see whether an asynchronous
  * transfer has terminated or not */
-typedef union {
+union _starpu_async_channel_event
+{
 	int dummy;
 #ifdef STARPU_USE_CUDA
 	cudaEvent_t cuda_event;
@@ -46,22 +47,23 @@ typedef union {
 #ifdef STARPU_USE_OPENCL
         cl_event opencl_event;
 #endif
-} starpu_async_channel_event;
+};
 
-struct starpu_async_channel {
-	starpu_async_channel_event event;
-	starpu_node_kind type;
+struct _starpu_async_channel
+{
+	union _starpu_async_channel_event event;
+	enum starpu_node_kind type;
 };
 
 void _starpu_wake_all_blocked_workers_on_node(unsigned nodeid);
 
-int _starpu_driver_copy_data_1_to_1(starpu_data_handle handle,
-					struct starpu_data_replicate_s *src_replicate,
-					struct starpu_data_replicate_s *dst_replicate,
-					unsigned donotread,
-					struct starpu_data_request_s *req,
-					unsigned may_alloc);
+int _starpu_driver_copy_data_1_to_1(starpu_data_handle_t handle,
+				    struct _starpu_data_replicate *src_replicate,
+				    struct _starpu_data_replicate *dst_replicate,
+				    unsigned donotread,
+				    struct _starpu_data_request *req,
+				    unsigned may_alloc);
 
-unsigned _starpu_driver_test_request_completion(struct starpu_async_channel *async_channel);
-void _starpu_driver_wait_request_completion(struct starpu_async_channel *async_channel);
+unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *async_channel);
+void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel);
 #endif // __COPY_DRIVER_H__

+ 256 - 98
src/datawizard/data_request.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,27 +20,29 @@
 #include <datawizard/datawizard.h>
 
 /* requests that have not been treated at all */
-static starpu_data_request_list_t data_requests[STARPU_MAXNODES];
-static pthread_cond_t data_requests_list_cond[STARPU_MAXNODES];
+static struct _starpu_data_request_list *data_requests[STARPU_MAXNODES];
+static struct _starpu_data_request_list *prefetch_requests[STARPU_MAXNODES];
 static pthread_mutex_t data_requests_list_mutex[STARPU_MAXNODES];
 
 /* requests that are not terminated (eg. async transfers) */
-static starpu_data_request_list_t data_requests_pending[STARPU_MAXNODES];
-static pthread_cond_t data_requests_pending_list_cond[STARPU_MAXNODES];
+static struct _starpu_data_request_list *data_requests_pending[STARPU_MAXNODES];
 static pthread_mutex_t data_requests_pending_list_mutex[STARPU_MAXNODES];
 
+int starpu_memstrategy_drop_prefetch[STARPU_MAXNODES];
+
 void _starpu_init_data_request_lists(void)
 {
 	unsigned i;
 	for (i = 0; i < STARPU_MAXNODES; i++)
 	{
-		data_requests[i] = starpu_data_request_list_new();
-		PTHREAD_MUTEX_INIT(&data_requests_list_mutex[i], NULL);
-		PTHREAD_COND_INIT(&data_requests_list_cond[i], NULL);
+		prefetch_requests[i] = _starpu_data_request_list_new();
+		data_requests[i] = _starpu_data_request_list_new();
+		_STARPU_PTHREAD_MUTEX_INIT(&data_requests_list_mutex[i], NULL);
+
+		data_requests_pending[i] = _starpu_data_request_list_new();
+		_STARPU_PTHREAD_MUTEX_INIT(&data_requests_pending_list_mutex[i], NULL);
 
-		data_requests_pending[i] = starpu_data_request_list_new();
-		PTHREAD_MUTEX_INIT(&data_requests_pending_list_mutex[i], NULL);
-		PTHREAD_COND_INIT(&data_requests_pending_list_cond[i], NULL);
+		starpu_memstrategy_drop_prefetch[i]=0;
 	}
 }
 
@@ -49,18 +51,17 @@ void _starpu_deinit_data_request_lists(void)
 	unsigned i;
 	for (i = 0; i < STARPU_MAXNODES; i++)
 	{
-		PTHREAD_COND_DESTROY(&data_requests_pending_list_cond[i]);
-		PTHREAD_MUTEX_DESTROY(&data_requests_pending_list_mutex[i]);
-		starpu_data_request_list_delete(data_requests_pending[i]);
+		_STARPU_PTHREAD_MUTEX_DESTROY(&data_requests_pending_list_mutex[i]);
+		_starpu_data_request_list_delete(data_requests_pending[i]);
 
-		PTHREAD_COND_DESTROY(&data_requests_list_cond[i]);
-		PTHREAD_MUTEX_DESTROY(&data_requests_list_mutex[i]);
-		starpu_data_request_list_delete(data_requests[i]);
+		_STARPU_PTHREAD_MUTEX_DESTROY(&data_requests_list_mutex[i]);
+		_starpu_data_request_list_delete(data_requests[i]);
+		_starpu_data_request_list_delete(prefetch_requests[i]);
 	}
 }
 
 /* this should be called with the lock r->handle->header_lock taken */
-static void starpu_data_request_destroy(starpu_data_request_t r)
+static void starpu_data_request_destroy(struct _starpu_data_request *r)
 {
 	unsigned node;
 
@@ -71,25 +72,27 @@ static void starpu_data_request_destroy(starpu_data_request_t r)
 	{
 		node = r->src_replicate->memory_node;
 	}
-	else {
+	else
+	{
 		node = r->dst_replicate->memory_node;
 	}
 
 	STARPU_ASSERT(r->dst_replicate->request[node] == r);
 	r->dst_replicate->request[node] = NULL;
 	//fprintf(stderr, "DESTROY REQ %p (%d) refcnt %d\n", r, node, r->refcnt);
-	starpu_data_request_delete(r);
+	_starpu_data_request_delete(r);
 }
 
 /* handle->lock should already be taken !  */
-starpu_data_request_t _starpu_create_data_request(starpu_data_handle handle,
-				struct starpu_data_replicate_s *src_replicate,
-				struct starpu_data_replicate_s *dst_replicate,
-				uint32_t handling_node,
-				starpu_access_mode mode,
-				unsigned ndeps)
+struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t handle,
+							 struct _starpu_data_replicate *src_replicate,
+							 struct _starpu_data_replicate *dst_replicate,
+							 uint32_t handling_node,
+							 enum starpu_access_mode mode,
+							 unsigned ndeps,
+							 unsigned is_prefetch)
 {
-	starpu_data_request_t r = starpu_data_request_new();
+	struct _starpu_data_request *r = _starpu_data_request_new();
 
 	_starpu_spin_init(&r->lock);
 
@@ -99,6 +102,7 @@ starpu_data_request_t _starpu_create_data_request(starpu_data_handle handle,
 	r->mode = mode;
 	r->handling_node = handling_node;
 	r->completed = 0;
+	r->prefetch = is_prefetch;
 	r->retval = -1;
 	r->ndeps = ndeps;
 	r->next_req_count = 0;
@@ -106,15 +110,20 @@ starpu_data_request_t _starpu_create_data_request(starpu_data_handle handle,
 
 	_starpu_spin_lock(&r->lock);
 
+	/* Take a reference on the target for the request to be able to write it */
 	dst_replicate->refcnt++;
+	handle->busy_count++;
 
 	if (mode & STARPU_R)
 	{
 		unsigned src_node = src_replicate->memory_node;
 		dst_replicate->request[src_node] = r;
+		/* Take a reference on the source for the request to be able to read it */
 		src_replicate->refcnt++;
+		handle->busy_count++;
 	}
-	else {
+	else
+	{
 		unsigned dst_node = dst_replicate->memory_node;
 		dst_replicate->request[dst_node] = r;
 	}
@@ -126,14 +135,15 @@ starpu_data_request_t _starpu_create_data_request(starpu_data_handle handle,
 	return r;
 }
 
-int _starpu_wait_data_request_completion(starpu_data_request_t r, unsigned may_alloc)
+int _starpu_wait_data_request_completion(struct _starpu_data_request *r, unsigned may_alloc)
 {
 	int retval;
 	int do_delete = 0;
 
 	uint32_t local_node = _starpu_get_local_memory_node();
 
-	do {
+	do
+	{
 		_starpu_spin_lock(&r->lock);
 
 		if (r->completed)
@@ -147,13 +157,14 @@ int _starpu_wait_data_request_completion(starpu_data_request_t r, unsigned may_a
 
 		_starpu_datawizard_progress(local_node, may_alloc);
 
-	} while (1);
+	}
+	while (1);
 
 
 	retval = r->retval;
 	if (retval)
 		_STARPU_DISP("REQUEST %p COMPLETED (retval %d) !\n", r, r->retval);
-		
+
 
 	r->refcnt--;
 
@@ -162,15 +173,15 @@ int _starpu_wait_data_request_completion(starpu_data_request_t r, unsigned may_a
 		do_delete = 1;
 
 	_starpu_spin_unlock(&r->lock);
-	
+
 	if (do_delete)
 		starpu_data_request_destroy(r);
-	
+
 	return retval;
 }
 
 /* this is non blocking */
-void _starpu_post_data_request(starpu_data_request_t r, uint32_t handling_node)
+void _starpu_post_data_request(struct _starpu_data_request *r, uint32_t handling_node)
 {
 //	_STARPU_DEBUG("POST REQUEST\n");
 
@@ -185,9 +196,12 @@ void _starpu_post_data_request(starpu_data_request_t r, uint32_t handling_node)
 	}
 
 	/* insert the request in the proper list */
-	PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[handling_node]);
-	starpu_data_request_list_push_front(data_requests[handling_node], r);
-	PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[handling_node]);
+	_STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[handling_node]);
+	if (r->prefetch)
+		_starpu_data_request_list_push_back(prefetch_requests[handling_node], r);
+	else
+		_starpu_data_request_list_push_back(data_requests[handling_node], r);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[handling_node]);
 
 #ifndef STARPU_NON_BLOCKING_DRIVERS
 	_starpu_wake_all_blocked_workers_on_node(handling_node);
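
The hunk above routes prefetch requests to a dedicated prefetch_requests queue, so that demand fetches (data a ready task is actually waiting for) are never stuck behind speculative transfers. A toy dispatcher illustrating that priority between the two FIFOs might look like the sketch below; the queue type and request ids are invented for the example, with plain arrays standing in for the _starpu_data_request_list lists.

#include <stdio.h>

#define QLEN 8

/* Two FIFOs per memory node: demand fetches and prefetches. */
struct queue { int req[QLEN]; int head, tail; };

static void push(struct queue *q, int r) { q->req[q->tail++ % QLEN] = r; }
static int  empty(struct queue *q)       { return q->head == q->tail; }
static int  pop(struct queue *q)         { return q->req[q->head++ % QLEN]; }

/* Always serve demand fetches first; prefetches only fill idle time. */
static int next_request(struct queue *fetch, struct queue *prefetch)
{
	if (!empty(fetch))
		return pop(fetch);
	if (!empty(prefetch))
		return pop(prefetch);
	return -1;	/* nothing to do */
}

int main(void)
{
	struct queue fetch = { { 0 }, 0, 0 }, prefetch = { { 0 }, 0, 0 };

	push(&prefetch, 100);	/* speculative transfer */
	push(&fetch, 1);	/* a task is actually waiting on this one */

	int r;
	while ((r = next_request(&fetch, &prefetch)) != -1)
		printf("handling request %d\n", r);
	return 0;
}
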
@@ -195,13 +209,13 @@ void _starpu_post_data_request(starpu_data_request_t r, uint32_t handling_node)
 }
 
 /* We assume that r->lock is taken by the caller */
-void _starpu_data_request_append_callback(starpu_data_request_t r, void (*callback_func)(void *), void *callback_arg)
+void _starpu_data_request_append_callback(struct _starpu_data_request *r, void (*callback_func)(void *), void *callback_arg)
 {
 	STARPU_ASSERT(r);
 
 	if (callback_func)
 	{
-		struct callback_list *link = (struct callback_list *) malloc(sizeof(struct callback_list));
+		struct _starpu_callback_list *link = (struct _starpu_callback_list *) malloc(sizeof(struct _starpu_callback_list));
 		STARPU_ASSERT(link);
 
 		link->callback_func = callback_func;
@@ -212,22 +226,46 @@ void _starpu_data_request_append_callback(starpu_data_request_t r, void (*callba
 }
 
 /* This method is called with handle's header_lock taken */
-static void starpu_handle_data_request_completion(starpu_data_request_t r)
+static void starpu_handle_data_request_completion(struct _starpu_data_request *r)
 {
 	unsigned do_delete = 0;
-	starpu_data_handle handle = r->handle;
-	starpu_access_mode mode = r->mode;
+	starpu_data_handle_t handle = r->handle;
+	enum starpu_access_mode mode = r->mode;
 
-	struct starpu_data_replicate_s *src_replicate = r->src_replicate;
-	struct starpu_data_replicate_s *dst_replicate = r->dst_replicate;
+	struct _starpu_data_replicate *src_replicate = r->src_replicate;
+	struct _starpu_data_replicate *dst_replicate = r->dst_replicate;
 
+
+#ifdef STARPU_MEMORY_STATUS
+	enum _starpu_cache_state old_src_replicate_state = src_replicate->state;
+#endif
 	_starpu_update_data_state(handle, r->dst_replicate, mode);
 
+#ifdef STARPU_MEMORY_STATUS
+	if (src_replicate->state == STARPU_INVALID)
+	{
+		if (old_src_replicate_state == STARPU_OWNER)
+			_starpu_handle_stats_invalidated(handle, src_replicate->memory_node);
+		else
+		{
+			/* XXX Currently only ex-OWNER are tagged as invalidated */
+			/* XXX Have to check all old state of every node in case a SHARED data become OWNED by the dst_replicate */
+		}
+
+	}
+	if (dst_replicate->state == STARPU_SHARED)
+		_starpu_handle_stats_loaded_shared(handle, dst_replicate->memory_node);
+	else if (dst_replicate->state == STARPU_OWNER)
+	{
+		_starpu_handle_stats_loaded_owner(handle, dst_replicate->memory_node);
+	}
+#endif
+
 #ifdef STARPU_USE_FXT
 	uint32_t src_node = src_replicate->memory_node;
 	uint32_t dst_node = dst_replicate->memory_node;
 	size_t size = _starpu_data_get_size(handle);
-	STARPU_TRACE_END_DRIVER_COPY(src_node, dst_node, size, r->com_id);
+	_STARPU_TRACE_END_DRIVER_COPY(src_node, dst_node, size, r->com_id);
 #endif
 
 	/* Once the request has been fulfilled, we may submit the requests that
@@ -235,36 +273,42 @@ static void starpu_handle_data_request_completion(starpu_data_request_t r)
 	unsigned chained_req;
 	for (chained_req = 0; chained_req < r->next_req_count; chained_req++)
 	{
-		struct starpu_data_request_s *next_req = r->next_req[chained_req];
+		struct _starpu_data_request *next_req = r->next_req[chained_req];
 		STARPU_ASSERT(next_req->ndeps > 0);
 		next_req->ndeps--;
 		_starpu_post_data_request(next_req, next_req->handling_node);
 	}
 
 	r->completed = 1;
-	
-	/* Remove a reference on the destination replicate  */
+
+	/* Remove a reference on the destination replicate for the request */
 	STARPU_ASSERT(dst_replicate->refcnt > 0);
 	dst_replicate->refcnt--;
+	STARPU_ASSERT(handle->busy_count > 0);
+	handle->busy_count--;
 
 	/* In case the source was "locked" by the request too */
 	if (mode & STARPU_R)
 	{
 		STARPU_ASSERT(src_replicate->refcnt > 0);
 		src_replicate->refcnt--;
+		STARPU_ASSERT(handle->busy_count > 0);
+		handle->busy_count--;
 	}
 
+	_starpu_data_check_not_busy(handle);
+
 	r->refcnt--;
 
 	/* if nobody is waiting on that request, we can get rid of it */
 	if (r->refcnt == 0)
 		do_delete = 1;
-	
+
 	r->retval = 0;
 
 	/* In case there are one or multiple callbacks, we execute them now. */
-	struct callback_list *callbacks = r->callbacks;
-	
+	struct _starpu_callback_list *callbacks = r->callbacks;
+
 	_starpu_spin_unlock(&r->lock);
 
 	if (do_delete)
@@ -278,24 +322,24 @@ static void starpu_handle_data_request_completion(starpu_data_request_t r)
 	{
 		callbacks->callback_func(callbacks->callback_arg);
 
-		struct callback_list *next = callbacks->next;
+		struct _starpu_callback_list *next = callbacks->next;
 		free(callbacks);
 		callbacks = next;
 	}
 }
 
 /* TODO : accounting to see how much time was spent working for other people ... */
-static int starpu_handle_data_request(starpu_data_request_t r, unsigned may_alloc)
+static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned may_alloc)
 {
-	starpu_data_handle handle = r->handle;
+	starpu_data_handle_t handle = r->handle;
 
 	_starpu_spin_lock(&handle->header_lock);
 	_starpu_spin_lock(&r->lock);
 
-	struct starpu_data_replicate_s *src_replicate = r->src_replicate;
-	struct starpu_data_replicate_s *dst_replicate = r->dst_replicate;
+	struct _starpu_data_replicate *src_replicate = r->src_replicate;
+	struct _starpu_data_replicate *dst_replicate = r->dst_replicate;
 
-	starpu_access_mode r_mode = r->mode;
+	enum starpu_access_mode r_mode = r->mode;
 
 	STARPU_ASSERT(!(r_mode & STARPU_R) || src_replicate);
 	STARPU_ASSERT(!(r_mode & STARPU_R) || src_replicate->allocated);
@@ -307,7 +351,7 @@ static int starpu_handle_data_request(starpu_data_request_t r, unsigned may_allo
 	/* the header of the data must be locked by the worker that submitted the request */
 
 	r->retval = _starpu_driver_copy_data_1_to_1(handle, src_replicate,
-			dst_replicate, !(r_mode & STARPU_R), r, may_alloc);
+						    dst_replicate, !(r_mode & STARPU_R), r, may_alloc);
 
 	if (r->retval == -ENOMEM)
 	{
@@ -325,9 +369,9 @@ static int starpu_handle_data_request(starpu_data_request_t r, unsigned may_allo
 		 * requests in the meantime. */
 		_starpu_spin_unlock(&handle->header_lock);
 
-		PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[r->handling_node]);
-		starpu_data_request_list_push_front(data_requests_pending[r->handling_node], r);
-		PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[r->handling_node]);
+		_STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[r->handling_node]);
+		_starpu_data_request_list_push_front(data_requests_pending[r->handling_node], r);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[r->handling_node]);
 
 		return -EAGAIN;
 	}
@@ -341,17 +385,18 @@ static int starpu_handle_data_request(starpu_data_request_t r, unsigned may_allo
 
 void _starpu_handle_node_data_requests(uint32_t src_node, unsigned may_alloc)
 {
-	starpu_data_request_t r;
+	struct _starpu_data_request *r;
+	struct _starpu_data_request_list *new_data_requests;
 
 	/* take all the entries from the request list */
-        PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
+        _STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
 
-	starpu_data_request_list_t local_list = data_requests[src_node];
+	struct _starpu_data_request_list *local_list = data_requests[src_node];
 
-	if (starpu_data_request_list_empty(local_list))
+	if (_starpu_data_request_list_empty(local_list))
 	{
 		/* there is no request */
-                PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
+                _STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
 
 		return;
 	}
@@ -359,83 +404,161 @@ void _starpu_handle_node_data_requests(uint32_t src_node, unsigned may_alloc)
 	/* There is an entry: we create a new empty list to replace the list of
 	 * requests, and we handle the request(s) one by one in the former
 	 * list, without concurrency issues.*/
-	data_requests[src_node] = starpu_data_request_list_new();
+	data_requests[src_node] = _starpu_data_request_list_new();
 
-	PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
+
+	new_data_requests = _starpu_data_request_list_new();
 
 	/* for all entries of the list */
-	while (!starpu_data_request_list_empty(local_list))
+	while (!_starpu_data_request_list_empty(local_list))
 	{
                 int res;
 
-		r = starpu_data_request_list_pop_back(local_list);
+		r = _starpu_data_request_list_pop_front(local_list);
 
 		res = starpu_handle_data_request(r, may_alloc);
 		if (res == -ENOMEM)
 		{
-                        PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
-			starpu_data_request_list_push_front(data_requests[src_node], r);
-			PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
+			_starpu_data_request_list_push_back(new_data_requests, r);
 		}
+	}
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
+	_starpu_data_request_list_push_list_front(new_data_requests, data_requests[src_node]);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
+
+	_starpu_data_request_list_delete(new_data_requests);
+	_starpu_data_request_list_delete(local_list);
+}
+
+void _starpu_handle_node_prefetch_requests(uint32_t src_node, unsigned may_alloc)
+{
+	starpu_memstrategy_drop_prefetch[src_node]=0;
+
+	struct _starpu_data_request *r;
+	struct _starpu_data_request_list *new_data_requests;
+	struct _starpu_data_request_list *new_prefetch_requests;
 
-		/* wake the requesting worker up */
-		// if we do not progress ..
-		// pthread_cond_broadcast(&data_requests_list_cond[src_node]);
+	/* take all the entries from the request list */
+        _STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
+
+	struct _starpu_data_request_list *local_list = prefetch_requests[src_node];
+
+	if (_starpu_data_request_list_empty(local_list))
+	{
+		/* there is no request */
+                _STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
+		return;
 	}
 
-	starpu_data_request_list_delete(local_list);
+	/* There is an entry: we create a new empty list to replace the list of
+	 * requests, and we handle the request(s) one by one in the former
+	 * list, without concurrency issues.*/
+	prefetch_requests[src_node] = _starpu_data_request_list_new();
+
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
+
+	new_data_requests = _starpu_data_request_list_new();
+	new_prefetch_requests = _starpu_data_request_list_new();
+
+	/* for all entries of the list */
+	while (!_starpu_data_request_list_empty(local_list))
+	{
+                int res;
+
+		r = _starpu_data_request_list_pop_front(local_list);
+
+		res = starpu_handle_data_request(r, may_alloc);
+		if (res == -ENOMEM )
+		{
+			starpu_memstrategy_drop_prefetch[src_node]=1;
+			if (r->prefetch)
+				_starpu_data_request_list_push_back(new_prefetch_requests, r);
+			else
+			{
+				/* Prefetch request promoted while in tmp list*/
+				_starpu_data_request_list_push_back(new_data_requests, r);
+			}
+			break;
+		}
+	}
+
+	while(!_starpu_data_request_list_empty(local_list) && starpu_memstrategy_drop_prefetch[src_node])
+	{
+		r = _starpu_data_request_list_pop_front(local_list);
+		if (r->prefetch)
+			_starpu_data_request_list_push_back(new_prefetch_requests, r);
+		else
+			_starpu_data_request_list_push_back(new_data_requests, r);
+	}
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
+	_starpu_data_request_list_push_list_front(new_data_requests, data_requests[src_node]);
+	_starpu_data_request_list_push_list_front(new_prefetch_requests, prefetch_requests[src_node]);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
+
+	_starpu_data_request_list_delete(new_data_requests);
+	_starpu_data_request_list_delete(new_prefetch_requests);
+	_starpu_data_request_list_delete(local_list);
 }
 
 static void _handle_pending_node_data_requests(uint32_t src_node, unsigned force)
 {
 //	_STARPU_DEBUG("_starpu_handle_pending_node_data_requests ...\n");
+//
+	struct _starpu_data_request_list *new_data_requests_pending = _starpu_data_request_list_new();
 
-	PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[src_node]);
+	_STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[src_node]);
 
 	/* for all entries of the list */
-	starpu_data_request_list_t local_list = data_requests_pending[src_node];
-	data_requests_pending[src_node] = starpu_data_request_list_new();
+	struct _starpu_data_request_list *local_list = data_requests_pending[src_node];
+	data_requests_pending[src_node] = _starpu_data_request_list_new();
 
-	PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
 
-	while (!starpu_data_request_list_empty(local_list))
+	while (!_starpu_data_request_list_empty(local_list))
 	{
-		starpu_data_request_t r;
-		r = starpu_data_request_list_pop_back(local_list);
+		struct _starpu_data_request *r;
+		r = _starpu_data_request_list_pop_front(local_list);
+
+		starpu_data_handle_t handle = r->handle;
 
-		starpu_data_handle handle = r->handle;
-		
 		_starpu_spin_lock(&handle->header_lock);
-	
+
 		_starpu_spin_lock(&r->lock);
-	
+
 		/* wait until the transfer is terminated */
 		if (force)
 		{
 			_starpu_driver_wait_request_completion(&r->async_channel);
 			starpu_handle_data_request_completion(r);
 		}
-		else {
+		else
+		{
 			if (_starpu_driver_test_request_completion(&r->async_channel))
 			{
 				/* The request was completed */
 				starpu_handle_data_request_completion(r);
 			}
-			else {
+			else
+			{
 				/* The request was not completed, so we put it
 				 * back again on the list of pending requests
 				 * so that it can be handled later on. */
 				_starpu_spin_unlock(&r->lock);
 				_starpu_spin_unlock(&handle->header_lock);
 
-				PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[src_node]);
-				starpu_data_request_list_push_front(data_requests_pending[src_node], r);
-				PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
+				_starpu_data_request_list_push_back(new_data_requests_pending, r);
 			}
 		}
 	}
+	_STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[src_node]);
+	_starpu_data_request_list_push_list_back(data_requests_pending[src_node], new_data_requests_pending);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
 
-	starpu_data_request_list_delete(local_list);
+	_starpu_data_request_list_delete(local_list);
+	_starpu_data_request_list_delete(new_data_requests_pending);
 }
 
 void _starpu_handle_pending_node_data_requests(uint32_t src_node)
@@ -451,8 +574,43 @@ void _starpu_handle_all_pending_node_data_requests(uint32_t src_node)
 int _starpu_check_that_no_data_request_exists(uint32_t node)
 {
 	/* XXX lock that !!! that's a quick'n'dirty test */
-	int no_request = starpu_data_request_list_empty(data_requests[node]);
-	int no_pending = starpu_data_request_list_empty(data_requests_pending[node]);
+	int no_request = _starpu_data_request_list_empty(data_requests[node]);
+	int no_pending = _starpu_data_request_list_empty(data_requests_pending[node]);
 
 	return (no_request && no_pending);
 }
+
+
+void _starpu_update_prefetch_status(struct _starpu_data_request *r)
+{
+	STARPU_ASSERT(r->prefetch > 0);
+	r->prefetch=0;
+
+	/* We have to promote chained_request too! */
+	unsigned chained_req;
+	for (chained_req = 0; chained_req < r->next_req_count; chained_req++)
+	{
+		struct _starpu_data_request *next_req = r->next_req[chained_req];
+		if (next_req->prefetch)
+			_starpu_update_prefetch_status(next_req);
+	}
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[r->handling_node]);
+
+	/* The request can be in a different list (handling request or the temp list)
+	 * we have to check that it is really in the prefetch list. */
+	struct _starpu_data_request *r_iter;
+	for (r_iter = _starpu_data_request_list_begin(prefetch_requests[r->handling_node]);
+	     r_iter != _starpu_data_request_list_end(prefetch_requests[r->handling_node]);
+	     r_iter = _starpu_data_request_list_next(r_iter))
+	{
+
+		if (r==r_iter)
+		{
+			_starpu_data_request_list_erase(prefetch_requests[r->handling_node],r);
+			_starpu_data_request_list_push_front(data_requests[r->handling_node],r);
+			break;
+		}
+	}
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[r->handling_node]);
+}
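
_starpu_handle_node_data_requests(), its prefetch variant and _handle_pending_node_data_requests() all follow the same locking discipline: detach the whole list while holding the per-node mutex, replace it with a fresh empty list, process the detached requests without the lock, and splice whatever failed with -ENOMEM (or is still in flight) back onto the shared list. A condensed, self-contained restatement of that pattern, with an invented request type and a fake handler, is:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct req { int id; struct req *next; };

static struct req *pending;			/* shared queue for one node */
static pthread_mutex_t pending_mutex = PTHREAD_MUTEX_INITIALIZER;

static void push_front(struct req **list, struct req *r) { r->next = *list; *list = r; }

/* Fake handler: odd ids "fail" with -ENOMEM and must be retried later. */
static int handle(struct req *r) { return (r->id & 1) ? -ENOMEM : 0; }

static void handle_node_requests(void)
{
	/* 1. Detach the whole list under the lock. */
	pthread_mutex_lock(&pending_mutex);
	struct req *local = pending;
	pending = NULL;
	pthread_mutex_unlock(&pending_mutex);

	/* 2. Process without holding the lock; keep failures aside. */
	struct req *requeue = NULL;
	while (local)
	{
		struct req *r = local;
		local = r->next;
		if (handle(r) == -ENOMEM)
			push_front(&requeue, r);
		else
			free(r);
	}

	/* 3. Splice the failures back onto whatever arrived meanwhile. */
	pthread_mutex_lock(&pending_mutex);
	while (requeue)
	{
		struct req *r = requeue;
		requeue = r->next;
		push_front(&pending, r);
	}
	pthread_mutex_unlock(&pending_mutex);
}

int main(void)
{
	for (int i = 0; i < 4; i++)
	{
		struct req *r = malloc(sizeof(*r));
		r->id = i;
		push_front(&pending, r);
	}
	handle_node_requests();

	for (struct req *r = pending; r; r = r->next)
		printf("still pending: %d\n", r->id);
	return 0;
}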

+ 34 - 28
src/datawizard/data_request.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,29 +24,31 @@
 #include <common/list.h>
 #include <common/starpu_spinlock.h>
 
-struct starpu_data_replicate_s;
+struct _starpu_data_replicate;
 
-struct callback_list {
+struct _starpu_callback_list
+{
 	void (*callback_func)(void *);
 	void *callback_arg;
-	struct callback_list *next;
+	struct _starpu_callback_list *next;
 };
 
-LIST_TYPE(starpu_data_request,
-	starpu_spinlock_t lock;
+LIST_TYPE(_starpu_data_request,
+	struct _starpu_spinlock lock;
 	unsigned refcnt;
 
-	starpu_data_handle handle;
-	struct starpu_data_replicate_s *src_replicate;
-	struct starpu_data_replicate_s *dst_replicate;
+	starpu_data_handle_t handle;
+	struct _starpu_data_replicate *src_replicate;
+	struct _starpu_data_replicate *dst_replicate;
 
 	uint32_t handling_node;
 
-	starpu_access_mode mode;
+	enum starpu_access_mode mode;
 
-	struct starpu_async_channel async_channel;
+	struct _starpu_async_channel async_channel;
 
 	unsigned completed;
+	unsigned prefetch;
 	int retval;
 
 	/* The request will not actually be submitted until there remains
@@ -54,28 +56,28 @@ LIST_TYPE(starpu_data_request,
 	unsigned ndeps;
 
 	/* in case we have a chain of request (eg. for nvidia multi-GPU) */
-	struct starpu_data_request_s *next_req[STARPU_MAXNODES];
+	struct _starpu_data_request *next_req[STARPU_MAXNODES];
 	/* who should perform the next request ? */
 	unsigned next_req_count;
 
-	struct callback_list *callbacks;
+	struct _starpu_callback_list *callbacks;
 
 #ifdef STARPU_USE_FXT
 	unsigned com_id;
 #endif
-);
+)
 
 /* Everyone that wants to access some piece of data will post a request.
  * Not only StarPU internals, but also the application may put such requests */
-LIST_TYPE(starpu_data_requester,
+LIST_TYPE(_starpu_data_requester,
 	/* what kind of access is requested ? */
-	starpu_access_mode mode;
+	enum starpu_access_mode mode;
 
 	/* applications may also directly manipulate data */
 	unsigned is_requested_by_codelet;
 
 	/* in case this is a codelet that will do the access */
-	struct starpu_job_s *j;
+	struct _starpu_job *j;
 	unsigned buffer_index;
 
 	/* if this is more complicated ... (eg. application request) 
@@ -83,28 +85,32 @@ LIST_TYPE(starpu_data_requester,
 	 */
 	void (*ready_data_callback)(void *argcb);
 	void *argcb;
-);
+)
 
 void _starpu_init_data_request_lists(void);
 void _starpu_deinit_data_request_lists(void);
-void _starpu_post_data_request(starpu_data_request_t r, uint32_t handling_node);
+void _starpu_post_data_request(struct _starpu_data_request *r, uint32_t handling_node);
 void _starpu_handle_node_data_requests(uint32_t src_node, unsigned may_alloc);
+void _starpu_handle_node_prefetch_requests(uint32_t src_node, unsigned may_alloc);
 
 void _starpu_handle_pending_node_data_requests(uint32_t src_node);
 void _starpu_handle_all_pending_node_data_requests(uint32_t src_node);
 
 int _starpu_check_that_no_data_request_exists(uint32_t node);
 
-starpu_data_request_t _starpu_create_data_request(starpu_data_handle handle,
-				struct starpu_data_replicate_s *src_replicate,
-				struct starpu_data_replicate_s *dst_replicate,
-				uint32_t handling_node,
-				starpu_access_mode mode,
-				unsigned ndeps);
+struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t handle,
+							 struct _starpu_data_replicate *src_replicate,
+							 struct _starpu_data_replicate *dst_replicate,
+							 uint32_t handling_node,
+							 enum starpu_access_mode mode,
+							 unsigned ndeps,
+							 unsigned is_prefetch);
 
-int _starpu_wait_data_request_completion(starpu_data_request_t r, unsigned may_alloc);
+int _starpu_wait_data_request_completion(struct _starpu_data_request *r, unsigned may_alloc);
 
-void _starpu_data_request_append_callback(starpu_data_request_t r,
-			void (*callback_func)(void *), void *callback_arg);
+void _starpu_data_request_append_callback(struct _starpu_data_request *r,
+					  void (*callback_func)(void *),
+					  void *callback_arg);
 
+void _starpu_update_prefetch_status(struct _starpu_data_request *r);
 #endif // __DATA_REQUEST_H__
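
The _starpu_callback_list renamed in this header is a plain singly-linked list of (function, argument) pairs; starpu_handle_data_request_completion() walks it, invokes each entry and frees the links. A minimal standalone version of that append-then-fire pattern (invented names, not the StarPU structures) looks like:

#include <stdio.h>
#include <stdlib.h>

struct callback_list
{
	void (*func)(void *);
	void *arg;
	struct callback_list *next;
};

/* Prepend a callback; the completion path will fire them all later. */
static void append_callback(struct callback_list **head,
			    void (*func)(void *), void *arg)
{
	struct callback_list *link = malloc(sizeof(*link));
	link->func = func;
	link->arg = arg;
	link->next = *head;
	*head = link;
}

/* Invoke and free every registered callback, as done on completion. */
static void fire_callbacks(struct callback_list *head)
{
	while (head)
	{
		struct callback_list *next = head->next;
		head->func(head->arg);
		free(head);
		head = next;
	}
}

static void say(void *arg) { printf("request done: %s\n", (const char *) arg); }

int main(void)
{
	struct callback_list *cbs = NULL;
	append_callback(&cbs, say, "wake worker");
	append_callback(&cbs, say, "notify application");
	fire_callbacks(cbs);
	return 0;
}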

+ 87 - 18
src/datawizard/datastats.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -17,6 +17,7 @@
 
 #include <starpu.h>
 #include <datawizard/datastats.h>
+#include <datawizard/coherency.h>
 #include <common/config.h>
 
 #ifdef STARPU_DATA_STATS
@@ -94,13 +95,13 @@ void _starpu_display_alloc_cache_stats(void)
 #ifdef STARPU_DATA_STATS
 	fprintf(stderr, "Allocation cache stats:\n");
 	unsigned node;
-	for (node = 0; node < STARPU_MAXNODES; node++) 
+	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		if (alloc_cnt[node]) 
+		if (alloc_cnt[node])
 		{
 			fprintf(stderr, "memory node %d\n", node);
 			fprintf(stderr, "\ttotal alloc : %u\n", alloc_cnt[node]);
-			fprintf(stderr, "\tcached alloc: %u (%2.2f \%%)\n", 
+			fprintf(stderr, "\tcached alloc: %u (%2.2f \%%)\n",
 				alloc_cache_hit_cnt[node], (100.0f*alloc_cache_hit_cnt[node])/(alloc_cnt[node]));
 		}
 	}
@@ -109,38 +110,106 @@ void _starpu_display_alloc_cache_stats(void)
 
 /* measure the amount of data transfers between each pair of nodes */
 #ifdef STARPU_DATA_STATS
+static size_t comm_amount[STARPU_MAXNODES][STARPU_MAXNODES];
+#endif /* STARPU_DATA_STATS */
 
-static size_t comm_ammount[STARPU_MAXNODES][STARPU_MAXNODES];
+void _starpu_comm_amounts_inc(unsigned src  __attribute__ ((unused)), unsigned dst  __attribute__ ((unused)), size_t size  __attribute__ ((unused)))
+{
+#ifdef STARPU_DATA_STATS
+	comm_amount[src][dst] += size;
+#endif /* STARPU_DATA_STATS */
+}
 
 void _starpu_display_comm_amounts(void)
 {
+#ifdef STARPU_DATA_STATS
 	unsigned src, dst;
 
-	unsigned long sum = 0;
+	size_t sum = 0;
+
+	for (dst = 0; dst < STARPU_MAXNODES; dst++)
+		for (src = 0; src < STARPU_MAXNODES; src++)
+		{
+			sum += comm_amount[src][dst];
+			sum += comm_amount[dst][src];
+		}
+
+	fprintf(stderr, "\nData transfers stats:\nTOTAL transfers %f MB\n", (float)sum/1024/1024);
 
 	for (dst = 0; dst < STARPU_MAXNODES; dst++)
-	for (src = 0; src < STARPU_MAXNODES; src++)
+		for (src = dst + 1; src < STARPU_MAXNODES; src++)
+		{
+			if (comm_amount[src][dst])
+				fprintf(stderr, "\t%d <-> %d\t%f MB\n\t\t%d -> %d\t%f MB\n\t\t%d -> %d\t%f MB\n",
+					src, dst, ((float)comm_amount[src][dst] + (float)comm_amount[dst][src])/(1024*1024),
+					src, dst, ((float)comm_amount[src][dst])/(1024*1024),
+					dst, src, ((float)comm_amount[dst][src])/(1024*1024));
+		}
+#endif
+}
+
+#ifdef STARPU_MEMORY_STATUS
+void _starpu_display_data_stats(void)
+{
+	unsigned node;
+	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		sum += (unsigned long)comm_ammount[src][dst];
+		_starpu_display_data_stats_by_node(node);
 	}
+}
 
-	fprintf(stderr, "\nData transfers stats:\nTOTAL transfers %ld MB\n", sum/(1024*1024));
+void _starpu_display_data_handle_stats(starpu_data_handle_t handle)
+{
+	unsigned node;
 
-	for (dst = 0; dst < STARPU_MAXNODES; dst++)
-	for (src = dst + 1; src < STARPU_MAXNODES; src++)
+	fprintf(stderr, "#-----\n");
+	fprintf(stderr, "Data : %p\n", handle);
+	fprintf(stderr, "Size : %d\n", (int)handle->data_size);
+	fprintf(stderr, "\n");
+
+	fprintf(stderr, "#--\n");
+	fprintf(stderr, "Data access stats\n");
+	fprintf(stderr, "/!\\ Work Underway\n");
+	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		if (comm_ammount[src][dst])
-			fprintf(stderr, "\t%d <-> %d\t%ld MB\n\t\t%d -> %d\t%ld MB\n\t\t%d -> %d\t%ld MB\n",
-				src, dst, ((unsigned long)comm_ammount[src][dst] + (unsigned long)comm_ammount[dst][src])/(1024*1024),
-				src, dst, ((unsigned long)comm_ammount[src][dst])/(1024*1024),
-				dst, src, ((unsigned long)comm_ammount[dst][src])/(1024*1024));
+		if (handle->stats_direct_access[node]+handle->stats_loaded_shared[node]
+		    +handle->stats_invalidated[node]+handle->stats_loaded_owner[node])
+		{
+			fprintf(stderr, "Node #%d\n", node);
+			fprintf(stderr, "\tDirect access : %d\n", handle->stats_direct_access[node]);
+			/* XXX Not Working yet. */
+			if (handle->stats_shared_to_owner[node])
+				fprintf(stderr, "\t\tShared to Owner : %d\n", handle->stats_shared_to_owner[node]);
+			fprintf(stderr, "\tLoaded (Owner) : %d\n", handle->stats_loaded_owner[node]);
+			fprintf(stderr, "\tLoaded (Shared) : %d\n", handle->stats_loaded_shared[node]);
+			fprintf(stderr, "\tInvalidated (was Owner) : %d\n\n", handle->stats_invalidated[node]);
+		}
 	}
 }
 
-#else
+void _starpu_handle_stats_cache_hit(starpu_data_handle_t handle, unsigned node)
+{
+	handle->stats_direct_access[node]++;
+}
 
-void _starpu_display_comm_amounts(void)
+void _starpu_handle_stats_loaded_shared(starpu_data_handle_t handle, unsigned node)
+{
+	handle->stats_loaded_shared[node]++;
+}
+
+void _starpu_handle_stats_loaded_owner(starpu_data_handle_t handle, unsigned node)
+{
+	handle->stats_loaded_owner[node]++;
+}
+
+void _starpu_handle_stats_shared_to_owner(starpu_data_handle_t handle, unsigned node)
+{
+	handle->stats_shared_to_owner[node]++;
+}
+
+void _starpu_handle_stats_invalidated(starpu_data_handle_t handle, unsigned node)
 {
+	handle->stats_invalidated[node]++;
 }
 
 #endif
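
The rewritten statistics keep one byte counter per (source node, destination node) pair and print the totals in MB. A stripped-down, self-contained version of that bookkeeping, with a hypothetical node count of 4 standing in for STARPU_MAXNODES, is:

#include <stdio.h>

#define MAXNODES 4	/* stand-in for STARPU_MAXNODES */

static size_t comm_amount[MAXNODES][MAXNODES];

static void comm_amounts_inc(unsigned src, unsigned dst, size_t size)
{
	comm_amount[src][dst] += size;
}

static void display_comm_amounts(void)
{
	size_t sum = 0;
	for (unsigned dst = 0; dst < MAXNODES; dst++)
		for (unsigned src = 0; src < MAXNODES; src++)
			sum += comm_amount[src][dst];

	printf("TOTAL transfers %f MB\n", (float) sum / (1024 * 1024));

	/* One line per unordered pair, with both directions detailed. */
	for (unsigned dst = 0; dst < MAXNODES; dst++)
		for (unsigned src = dst + 1; src < MAXNODES; src++)
			if (comm_amount[src][dst] || comm_amount[dst][src])
				printf("%u <-> %u: %f MB (%u->%u %f, %u->%u %f)\n",
				       src, dst,
				       (float) (comm_amount[src][dst] + comm_amount[dst][src]) / (1024 * 1024),
				       src, dst, (float) comm_amount[src][dst] / (1024 * 1024),
				       dst, src, (float) comm_amount[dst][src] / (1024 * 1024));
}

int main(void)
{
	comm_amounts_inc(0, 1, 32u << 20);	/* 32 MB RAM -> node 1 */
	comm_amounts_inc(1, 0,  8u << 20);	/*  8 MB back */
	display_comm_amounts();
	return 0;
}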

+ 12 - 2
src/datawizard/datastats.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -31,8 +31,18 @@ void _starpu_display_msi_stats(void);
 void _starpu_allocation_cache_hit(unsigned node __attribute__ ((unused)));
 void _starpu_data_allocation_inc_stats(unsigned node __attribute__ ((unused)));
 
-
+void _starpu_comm_amounts_inc(unsigned src, unsigned dst, size_t size);
 void _starpu_display_comm_amounts(void);
 void _starpu_display_alloc_cache_stats(void);
 
+void _starpu_display_data_stats();
+void _starpu_display_data_handle_stats(starpu_data_handle_t handle);
+
+void _starpu_handle_stats_cache_hit(starpu_data_handle_t handle, unsigned node);
+void _starpu_handle_stats_loaded_shared(starpu_data_handle_t handle, unsigned node);
+void _starpu_handle_stats_loaded_owner(starpu_data_handle_t handle, unsigned node);
+void _starpu_handle_stats_shared_to_owner(starpu_data_handle_t handle, unsigned node);
+void _starpu_handle_stats_invalidated(starpu_data_handle_t handle, unsigned node);
+
+
 #endif // __DATASTATS_H__

+ 104 - 50
src/datawizard/filters.c

@@ -1,8 +1,9 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2012 INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,12 +20,12 @@
 #include <datawizard/filters.h>
 #include <datawizard/footprint.h>
 
-static void starpu_data_create_children(starpu_data_handle handle, unsigned nchildren, struct starpu_data_filter *f);
+static void starpu_data_create_children(starpu_data_handle_t handle, unsigned nchildren, struct starpu_data_filter *f);
 
 /*
  * This function applies a data filter on all the elements of a partition
  */
-static void map_filter(starpu_data_handle root_handle, struct starpu_data_filter *f)
+static void map_filter(starpu_data_handle_t root_handle, struct starpu_data_filter *f)
 {
 	/* we need to apply the data filter on all leaf of the tree */
 	if (root_handle->nchildren == 0)
@@ -32,7 +33,8 @@ static void map_filter(starpu_data_handle root_handle, struct starpu_data_filter
 		/* this is a leaf */
 		starpu_data_partition(root_handle, f);
 	}
-	else {
+	else
+	{
 		/* try to apply the data filter recursively */
 		unsigned child;
 		for (child = 0; child < root_handle->nchildren; child++)
@@ -41,7 +43,7 @@ static void map_filter(starpu_data_handle root_handle, struct starpu_data_filter
 		}
 	}
 }
-void starpu_data_vmap_filters(starpu_data_handle root_handle, unsigned nfilters, va_list pa)
+void starpu_data_vmap_filters(starpu_data_handle_t root_handle, unsigned nfilters, va_list pa)
 {
 	unsigned i;
 	for (i = 0; i < nfilters; i++)
@@ -55,7 +57,7 @@ void starpu_data_vmap_filters(starpu_data_handle root_handle, unsigned nfilters,
 	}
 }
 
-void starpu_data_map_filters(starpu_data_handle root_handle, unsigned nfilters, ...)
+void starpu_data_map_filters(starpu_data_handle_t root_handle, unsigned nfilters, ...)
 {
 	va_list pa;
 	va_start(pa, nfilters);
@@ -63,12 +65,12 @@ void starpu_data_map_filters(starpu_data_handle root_handle, unsigned nfilters,
 	va_end(pa);
 }
 
-int starpu_data_get_nb_children(starpu_data_handle handle)
+int starpu_data_get_nb_children(starpu_data_handle_t handle)
 {
         return handle->nchildren;
 }
 
-starpu_data_handle starpu_data_get_child(starpu_data_handle handle, unsigned i)
+starpu_data_handle_t starpu_data_get_child(starpu_data_handle_t handle, unsigned i)
 {
 	STARPU_ASSERT(i < handle->nchildren);
 
@@ -76,25 +78,25 @@ starpu_data_handle starpu_data_get_child(starpu_data_handle handle, unsigned i)
 }
 
 /*
- * example starpu_data_get_sub_data(starpu_data_handle root_handle, 3, 42, 0, 1);
+ * example starpu_data_get_sub_data(starpu_data_handle_t root_handle, 3, 42, 0, 1);
  */
-starpu_data_handle starpu_data_get_sub_data(starpu_data_handle root_handle, unsigned depth, ... )
+starpu_data_handle_t starpu_data_get_sub_data(starpu_data_handle_t root_handle, unsigned depth, ... )
 {
 	va_list pa;
 	va_start(pa, depth);
-	starpu_data_handle handle = starpu_data_vget_sub_data(root_handle, depth, pa);
+	starpu_data_handle_t handle = starpu_data_vget_sub_data(root_handle, depth, pa);
 	va_end(pa);
 
 	return handle;
 }
 
-starpu_data_handle starpu_data_vget_sub_data(starpu_data_handle root_handle, unsigned depth, va_list pa )
+starpu_data_handle_t starpu_data_vget_sub_data(starpu_data_handle_t root_handle, unsigned depth, va_list pa )
 {
 	STARPU_ASSERT(root_handle);
-	starpu_data_handle current_handle = root_handle;
+	starpu_data_handle_t current_handle = root_handle;
 
 	/* the variable number of argument must correlate the depth in the tree */
-	unsigned i; 
+	unsigned i;
 	for (i = 0; i < depth; i++)
 	{
 		unsigned next_child;
@@ -108,16 +110,16 @@ starpu_data_handle starpu_data_vget_sub_data(starpu_data_handle root_handle, uns
 	return current_handle;
 }
 
-void starpu_data_partition(starpu_data_handle initial_handle, struct starpu_data_filter *f)
+void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_data_filter *f)
 {
 	unsigned nparts;
 	unsigned i;
+	unsigned node;
 
 	/* first take care to properly lock the data header */
 	_starpu_spin_lock(&initial_handle->header_lock);
 
-	/* there should not be mutiple filters applied on the same data */
-	STARPU_ASSERT(initial_handle->nchildren == 0);
+	STARPU_ASSERT_MSG(initial_handle->nchildren == 0, "there should not be multiple filters applied on the same data");
 
 	/* how many parts ? */
 	if (f->get_nchildren)
@@ -132,9 +134,21 @@ void starpu_data_partition(starpu_data_handle initial_handle, struct starpu_data
 
 	unsigned nworkers = starpu_worker_get_count();
 
+	for (node = 0; node < STARPU_MAXNODES; node++)
+	{
+		if (initial_handle->per_node[node].state != STARPU_INVALID)
+			break;
+	}
+	if (node == STARPU_MAXNODES) {
+		/* This is lazy allocation, allocate it now in main RAM, so as
+		 * to have somewhere to gather pieces later */
+		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[0], 0);
+		STARPU_ASSERT(!ret);
+	}
+
 	for (i = 0; i < nparts; i++)
 	{
-		starpu_data_handle child =
+		starpu_data_handle_t child =
 			starpu_data_get_child(initial_handle, i);
 
 		STARPU_ASSERT(child);
@@ -152,15 +166,19 @@ void starpu_data_partition(starpu_data_handle initial_handle, struct starpu_data
 		child->is_readonly = initial_handle->is_readonly;
 
 		/* initialize the chunk lock */
-		child->req_list = starpu_data_requester_list_new();
-		child->reduction_req_list = starpu_data_requester_list_new();
+		child->req_list = _starpu_data_requester_list_new();
+		child->reduction_req_list = _starpu_data_requester_list_new();
 		child->refcnt = 0;
+		child->busy_count = 0;
+		child->busy_waiting = 0;
+		_STARPU_PTHREAD_MUTEX_INIT(&child->busy_mutex, NULL);
+		_STARPU_PTHREAD_COND_INIT(&child->busy_cond, NULL);
 		child->reduction_refcnt = 0;
 		_starpu_spin_init(&child->header_lock);
 
 		child->sequential_consistency = initial_handle->sequential_consistency;
 
-		PTHREAD_MUTEX_INIT(&child->sequential_consistency_mutex, NULL);
+		_STARPU_PTHREAD_MUTEX_INIT(&child->sequential_consistency_mutex, NULL);
 		child->last_submitted_mode = STARPU_R;
 		child->last_submitted_writer = NULL;
 		child->last_submitted_readers = NULL;
@@ -178,11 +196,10 @@ void starpu_data_partition(starpu_data_handle initial_handle, struct starpu_data
 		child->last_submitted_ghost_readers_id = NULL;
 #endif
 
-		unsigned node;
 		for (node = 0; node < STARPU_MAXNODES; node++)
 		{
-			struct starpu_data_replicate_s *initial_replicate; 
-			struct starpu_data_replicate_s *child_replicate;
+			struct _starpu_data_replicate *initial_replicate;
+			struct _starpu_data_replicate *child_replicate;
 
 			initial_replicate = &initial_handle->per_node[node];
 			child_replicate = &child->per_node[node];
@@ -193,7 +210,7 @@ void starpu_data_partition(starpu_data_handle initial_handle, struct starpu_data
 			child_replicate->refcnt = 0;
 			child_replicate->memory_node = node;
 			child_replicate->relaxed_coherency = 0;
-			
+
 			/* update the interface */
 			void *initial_interface = starpu_data_get_interface_on_node(initial_handle, node);
 			void *child_interface = starpu_data_get_interface_on_node(child, node);
@@ -204,9 +221,9 @@ void starpu_data_partition(starpu_data_handle initial_handle, struct starpu_data
 		unsigned worker;
 		for (worker = 0; worker < nworkers; worker++)
 		{
-			struct starpu_data_replicate_s *child_replicate;
+			struct _starpu_data_replicate *child_replicate;
 			child_replicate = &child->per_worker[worker];
-			
+
 			child_replicate->state = STARPU_INVALID;
 			child_replicate->allocated = 0;
 			child_replicate->automatically_allocated = 0;
@@ -242,39 +259,72 @@ void starpu_data_partition(starpu_data_handle initial_handle, struct starpu_data
 	_starpu_spin_unlock(&initial_handle->header_lock);
 }
 
-void starpu_data_unpartition(starpu_data_handle root_handle, uint32_t gathering_node)
+static
+void _starpu_empty_codelet_function(void *buffers[], void *args)
+{
+	(void) buffers; // unused;
+	(void) args; // unused;
+}
+
+void starpu_data_unpartition(starpu_data_handle_t root_handle, uint32_t gathering_node)
 {
 	unsigned child;
 	unsigned node;
 
 	_starpu_spin_lock(&root_handle->header_lock);
 
+	STARPU_ASSERT_MSG(root_handle->nchildren != 0, "data is not partitioned");
+
 	/* first take all the children lock (in order !) */
 	for (child = 0; child < root_handle->nchildren; child++)
 	{
-		struct starpu_data_state_t *child_handle = &root_handle->children[child];
+		struct _starpu_data_state *child_handle = &root_handle->children[child];
 
 		/* make sure the intermediate children is unpartitionned as well */
 		if (child_handle->nchildren > 0)
 			starpu_data_unpartition(child_handle, gathering_node);
 
+		/* If this is a multiformat handle, we must convert the data now */
+#ifdef STARPU_DEVEL
+#warning TODO: _starpu_fetch_data_on_node should be doing it
+#endif
+		if (_starpu_data_is_multiformat_handle(child_handle) &&
+			starpu_node_get_kind(child_handle->mf_node) != STARPU_CPU_RAM)
+		{
+			struct starpu_codelet cl =
+			{
+				.where = STARPU_CPU,
+				.cpu_funcs = { _starpu_empty_codelet_function, NULL },
+				.modes = { STARPU_RW },
+				.nbuffers = 1
+			};
+			struct starpu_task *task = starpu_task_create();
+			task->handles[0] = child_handle;
+			task->cl = &cl;
+			task->synchronous = 1;
+			if (starpu_task_submit(task) != 0)
+				_STARPU_ERROR("Could not submit the conversion task while unpartitioning\n");
+		}
+
 		int ret;
-		ret = _starpu_fetch_data_on_node(child_handle, &child_handle->per_node[gathering_node], STARPU_R, 0, NULL, NULL);
-		/* for now we pretend that the RAM is almost unlimited and that gathering 
+		ret = _starpu_fetch_data_on_node(child_handle, &child_handle->per_node[gathering_node], STARPU_R, 0, 0, NULL, NULL);
+		/* for now we pretend that the RAM is almost unlimited and that gathering
 		 * data should be possible from the node that does the unpartionning ... we
 		 * don't want to have the programming deal with memory shortage at that time,
 		 * really */
-		STARPU_ASSERT(ret == 0); 
+		STARPU_ASSERT(ret == 0);
+
+		_starpu_spin_lock(&child_handle->header_lock);
 
 		_starpu_data_free_interfaces(&root_handle->children[child]);
-		starpu_data_requester_list_delete(child_handle->req_list);
-		starpu_data_requester_list_delete(child_handle->reduction_req_list);
+		_starpu_data_requester_list_delete(child_handle->req_list);
+		_starpu_data_requester_list_delete(child_handle->reduction_req_list);
 	}
 
 	/* the gathering_node should now have a valid copy of all the children.
 	 * For all nodes, if the node had all copies and none was locally
 	 * allocated then the data is still valid there, else, it's invalidated
-	 * for the gathering node, if we have some locally allocated data, we 
+	 * for the gathering node, if we have some locally allocated data, we
 	 * copy all the children (XXX this should not happen so we just do not
 	 * do anything since this is transparent ?) */
 	unsigned still_valid[STARPU_MAXNODES];
@@ -293,11 +343,12 @@ void starpu_data_unpartition(starpu_data_handle root_handle, uint32_t gathering_
 
 		for (child = 0; child < root_handle->nchildren; child++)
 		{
-			struct starpu_data_replicate_s *local = &root_handle->children[child].per_node[node];
+			struct _starpu_data_replicate *local = &root_handle->children[child].per_node[node];
 
-			if (local->state == STARPU_INVALID) {
+			if (local->state == STARPU_INVALID)
+			{
 				/* One of the bits is missing */
-				isvalid = 0; 
+				isvalid = 0;
 			}
 
 			if (local->allocated && local->automatically_allocated)
@@ -323,16 +374,17 @@ void starpu_data_unpartition(starpu_data_handle root_handle, uint32_t gathering_
 	/* either shared or owned */
 	STARPU_ASSERT(nvalids > 0);
 
-	starpu_cache_state newstate = (nvalids == 1)?STARPU_OWNER:STARPU_SHARED;
+	enum _starpu_cache_state newstate = (nvalids == 1)?STARPU_OWNER:STARPU_SHARED;
 
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		root_handle->per_node[node].state = 
+		root_handle->per_node[node].state =
 			still_valid[node]?newstate:STARPU_INVALID;
 	}
 
 	/* there is no child anymore */
-	//free(root_handle->children);
+	free(root_handle->children);
+	root_handle->children = NULL;
 	root_handle->nchildren = 0;
 
 	/* now the parent may be used again so we release the lock */
@@ -340,9 +392,9 @@ void starpu_data_unpartition(starpu_data_handle root_handle, uint32_t gathering_
 }
 
 /* each child may have his own interface type */
-static void starpu_data_create_children(starpu_data_handle handle, unsigned nchildren, struct starpu_data_filter *f)
+static void starpu_data_create_children(starpu_data_handle_t handle, unsigned nchildren, struct starpu_data_filter *f)
 {
-	handle->children = (struct starpu_data_state_t *) calloc(nchildren, sizeof(struct starpu_data_state_t));
+	handle->children = (struct _starpu_data_state *) calloc(nchildren, sizeof(struct _starpu_data_state));
 	STARPU_ASSERT(handle->children);
 
 	unsigned node;
@@ -353,16 +405,16 @@ static void starpu_data_create_children(starpu_data_handle handle, unsigned nchi
 
 	for (child = 0; child < nchildren; child++)
 	{
-		starpu_data_handle handle_child = &handle->children[child];
-		
-		struct starpu_data_interface_ops_t *ops;
-		
+		starpu_data_handle_t handle_child = &handle->children[child];
+
+		struct starpu_data_interface_ops *ops;
+
 		/* what's this child's interface ? */
 		if (f->get_child_ops)
 		  ops = f->get_child_ops(f, child);
 		else
 		  ops = handle->ops;
-		
+
 		handle_child->ops = ops;
 
 		size_t interfacesize = ops->interface_size;
@@ -381,8 +433,10 @@ static void starpu_data_create_children(starpu_data_handle handle, unsigned nchi
 			handle_child->per_worker[worker].data_interface = calloc(1, interfacesize);
 			STARPU_ASSERT(handle_child->per_worker[worker].data_interface);
 		}
+
+		handle_child->mf_node = handle->mf_node;
 	}
-	
+
 	/* this handle now has children */
 	handle->nchildren = nchildren;
 }

+ 24 - 14
src/datawizard/footprint.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,9 +16,9 @@
  */
 
 #include <datawizard/footprint.h>
-#include <common/hash.h>
+#include <starpu_hash.h>
 
-uint32_t _starpu_compute_buffers_footprint(starpu_job_t j)
+uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned nimpl, struct _starpu_job *j)
 {
 	if (j->footprint_is_computed)
 		return j->footprint;
@@ -28,13 +28,21 @@ uint32_t _starpu_compute_buffers_footprint(starpu_job_t j)
 
 	struct starpu_task *task = j->task;
 
-	for (buffer = 0; buffer < task->cl->nbuffers; buffer++)
-	{
-		starpu_data_handle handle = task->buffers[buffer].handle;
-
-		uint32_t handle_footprint = _starpu_data_get_footprint(handle);
-
-		footprint = _starpu_crc32_be(handle_footprint, footprint);
+	if (model && model->per_arch[arch][nimpl].size_base) {
+		size_t size = model->per_arch[arch][nimpl].size_base(task, arch, nimpl);
+		footprint = starpu_crc32_be_n(&size, sizeof(size), footprint);
+	} else if (model && model->size_base) {
+		size_t size = model->size_base(task, nimpl);
+		footprint = starpu_crc32_be_n(&size, sizeof(size), footprint);
+	} else {
+		for (buffer = 0; buffer < task->cl->nbuffers; buffer++)
+		{
+			starpu_data_handle_t handle = task->handles[buffer];
+
+			uint32_t handle_footprint = _starpu_data_get_footprint(handle);
+
+			footprint = starpu_crc32_be(handle_footprint, footprint);
+		}
 	}
 
 	j->footprint = footprint;
@@ -43,11 +51,13 @@ uint32_t _starpu_compute_buffers_footprint(starpu_job_t j)
 	return footprint;
 }
 
-uint32_t _starpu_compute_data_footprint(starpu_data_handle handle)
+uint32_t _starpu_compute_data_footprint(starpu_data_handle_t handle)
 {
-	uint32_t interfaceid = (uint32_t)starpu_get_handle_interface_id(handle);
+	uint32_t interfaceid = (uint32_t)starpu_handle_get_interface_id(handle);
+
+	STARPU_ASSERT(handle->ops->footprint);
 
 	uint32_t handle_footprint = handle->ops->footprint(handle);
 
-	return _starpu_crc32_be(handle_footprint, interfaceid);
+	return starpu_crc32_be(handle_footprint, interfaceid);
 }
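
The size_base hooks introduced above let a performance model characterize a task by an application-chosen size instead of the per-handle footprints. A minimal sketch, assuming a codelet whose first handle is a vector; the symbol name and helper below are illustrative only:

#include <starpu.h>

/* Illustrative size function, matching the size_base(task, nimpl) call above:
 * the task is characterized by the length of its first (vector) handle. */
static size_t vector_size_base(struct starpu_task *task, unsigned nimpl)
{
	(void) nimpl;
	return starpu_vector_get_nx(task->handles[0]);
}

static struct starpu_perfmodel vector_model =
{
	.type = STARPU_HISTORY_BASED,
	.symbol = "vector_kernel",
	.size_base = vector_size_base, /* hashed via starpu_crc32_be_n() above */
};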

+ 4 - 4
src/datawizard/footprint.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,9 +24,9 @@
 
 /* Compute the footprint that characterizes the job and cache it into the job
  * structure. */
-uint32_t _starpu_compute_buffers_footprint(struct starpu_job_s *j);
+uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned nimpl, struct _starpu_job *j);
 
 /* Compute the footprint that characterizes the layout of the data handle. */
-uint32_t _starpu_compute_data_footprint(starpu_data_handle handle);
+uint32_t _starpu_compute_data_footprint(starpu_data_handle_t handle);
 
 #endif // __FOOTPRINT_H__

+ 8 - 7
src/datawizard/interfaces/bcsr_filters.c

@@ -22,17 +22,17 @@
 
 void starpu_canonical_block_filter_bcsr(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, STARPU_ATTRIBUTE_UNUSED unsigned nparts)
 {
-	struct starpu_bcsr_interface_s *bcsr_father = (struct starpu_bcsr_interface_s *) father_interface;
+	struct starpu_bcsr_interface *bcsr_father = (struct starpu_bcsr_interface *) father_interface;
 	/* each chunk becomes a small dense matrix */
-	starpu_matrix_interface_t *matrix_child = (starpu_matrix_interface_t *) child_interface;
-	
+	struct starpu_matrix_interface *matrix_child = (struct starpu_matrix_interface *) child_interface;
+
 	size_t elemsize = bcsr_father->elemsize;
 	uint32_t firstentry = bcsr_father->firstentry;
 
 	/* size of the tiles */
 	uint32_t r = bcsr_father->r;
 	uint32_t c = bcsr_father->c;
-	
+
 	uint32_t ptr_offset = c*r*id*elemsize;
 
 	matrix_child->nx = c;
@@ -40,8 +40,9 @@ void starpu_canonical_block_filter_bcsr(void *father_interface, void *child_inte
 	matrix_child->ld = c;
 	matrix_child->elemsize = elemsize;
 
-	if (bcsr_father->nzval) {
-	  uint8_t *nzval = (uint8_t *)(bcsr_father->nzval);
-	  matrix_child->ptr = (uintptr_t)&nzval[firstentry + ptr_offset];
+	if (bcsr_father->nzval)
+	{
+		uint8_t *nzval = (uint8_t *)(bcsr_father->nzval);
+		matrix_child->ptr = (uintptr_t)&nzval[firstentry + ptr_offset];
 	}
 }

+ 97 - 88
src/datawizard/interfaces/bcsr_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -21,7 +21,7 @@
 #include <datawizard/coherency.h>
 #include <datawizard/copy_driver.h>
 #include <datawizard/filters.h>
-#include <common/hash.h>
+#include <starpu_hash.h>
 
 #include <starpu_cuda.h>
 #include <starpu_opencl.h>
@@ -42,7 +42,8 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTR
 static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 #endif
 
-static const struct starpu_data_copy_methods bcsr_copy_data_methods_s = {
+static const struct starpu_data_copy_methods bcsr_copy_data_methods_s =
+{
 	.ram_to_ram = copy_ram_to_ram,
 	.ram_to_spu = NULL,
 #ifdef STARPU_USE_CUDA
@@ -60,42 +61,45 @@ static const struct starpu_data_copy_methods bcsr_copy_data_methods_s = {
 	.spu_to_spu = NULL
 };
 
-static void register_bcsr_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
+static void register_bcsr_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface);
 static ssize_t allocate_bcsr_buffer_on_node(void *data_interface, uint32_t dst_node);
 static void free_bcsr_buffer_on_node(void *data_interface, uint32_t node);
-static size_t bcsr_interface_get_size(starpu_data_handle handle);
+static size_t bcsr_interface_get_size(starpu_data_handle_t handle);
 static int bcsr_compare(void *data_interface_a, void *data_interface_b);
-static uint32_t footprint_bcsr_interface_crc32(starpu_data_handle handle);
+static uint32_t footprint_bcsr_interface_crc32(starpu_data_handle_t handle);
 
 
-static struct starpu_data_interface_ops_t interface_bcsr_ops = {
+static struct starpu_data_interface_ops interface_bcsr_ops =
+{
 	.register_data_handle = register_bcsr_handle,
 	.allocate_data_on_node = allocate_bcsr_buffer_on_node,
 	.free_data_on_node = free_bcsr_buffer_on_node,
 	.copy_methods = &bcsr_copy_data_methods_s,
 	.get_size = bcsr_interface_get_size,
 	.interfaceid = STARPU_BCSR_INTERFACE_ID,
-	.interface_size = sizeof(starpu_bcsr_interface_t),
+	.interface_size = sizeof(struct starpu_bcsr_interface),
 	.footprint = footprint_bcsr_interface_crc32,
 	.compare = bcsr_compare
 };
 
-static void register_bcsr_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface)
+static void register_bcsr_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface)
 {
-	starpu_bcsr_interface_t *bcsr_interface = (starpu_bcsr_interface_t *) data_interface;
+	struct starpu_bcsr_interface *bcsr_interface = (struct starpu_bcsr_interface *) data_interface;
 
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		starpu_bcsr_interface_t *local_interface = (starpu_bcsr_interface_t *)
+		struct starpu_bcsr_interface *local_interface = (struct starpu_bcsr_interface *)
 			starpu_data_get_interface_on_node(handle, node);
 
-		if (node == home_node) {
+		if (node == home_node)
+		{
 			local_interface->nzval = bcsr_interface->nzval;
 			local_interface->colind = bcsr_interface->colind;
 			local_interface->rowptr = bcsr_interface->rowptr;
 		}
-		else {
+		else
+		{
 			local_interface->nzval = 0;
 			local_interface->colind = NULL;
 			local_interface->rowptr = NULL;
@@ -110,12 +114,13 @@ static void register_bcsr_handle(starpu_data_handle handle, uint32_t home_node,
 	}
 }
 
-void starpu_bcsr_data_register(starpu_data_handle *handleptr, uint32_t home_node,
+void starpu_bcsr_data_register(starpu_data_handle_t *handleptr, uint32_t home_node,
 		uint32_t nnz, uint32_t nrow, uintptr_t nzval, uint32_t *colind,
 		uint32_t *rowptr, uint32_t firstentry,
 		uint32_t r, uint32_t c, size_t elemsize)
 {
-	starpu_bcsr_interface_t bcsr_interface = {
+	struct starpu_bcsr_interface bcsr_interface =
+	{
 		.nzval = nzval,
 		.colind = colind,
 		.rowptr = rowptr,
@@ -130,21 +135,21 @@ void starpu_bcsr_data_register(starpu_data_handle *handleptr, uint32_t home_node
 	starpu_data_register(handleptr, home_node, &bcsr_interface, &interface_bcsr_ops);
 }
 
-static uint32_t footprint_bcsr_interface_crc32(starpu_data_handle handle)
+static uint32_t footprint_bcsr_interface_crc32(starpu_data_handle_t handle)
 {
 	uint32_t hash;
 
-	hash = _starpu_crc32_be(starpu_bcsr_get_nnz(handle), 0);
-	hash = _starpu_crc32_be(starpu_bcsr_get_c(handle), hash);
-	hash = _starpu_crc32_be(starpu_bcsr_get_r(handle), hash);
+	hash = starpu_crc32_be(starpu_bcsr_get_nnz(handle), 0);
+	hash = starpu_crc32_be(starpu_bcsr_get_c(handle), hash);
+	hash = starpu_crc32_be(starpu_bcsr_get_r(handle), hash);
 
 	return hash;
 }
 
 static int bcsr_compare(void *data_interface_a, void *data_interface_b)
 {
-	starpu_bcsr_interface_t *bcsr_a = (starpu_bcsr_interface_t *) data_interface_a;
-	starpu_bcsr_interface_t *bcsr_b = (starpu_bcsr_interface_t *) data_interface_b;
+	struct starpu_bcsr_interface *bcsr_a = (struct starpu_bcsr_interface *) data_interface_a;
+	struct starpu_bcsr_interface *bcsr_b = (struct starpu_bcsr_interface *) data_interface_b;
 
 	/* Two matrices are considered compatible if they have the same size */
 	return ((bcsr_a->nnz == bcsr_b->nnz)
@@ -155,87 +160,87 @@ static int bcsr_compare(void *data_interface_a, void *data_interface_b)
 }
 
 /* offer access to the data parameters */
-uint32_t starpu_bcsr_get_nnz(starpu_data_handle handle)
+uint32_t starpu_bcsr_get_nnz(starpu_data_handle_t handle)
 {
-	starpu_bcsr_interface_t *data_interface = (starpu_bcsr_interface_t *)
+	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return data_interface->nnz;
 }
 
-uint32_t starpu_bcsr_get_nrow(starpu_data_handle handle)
+uint32_t starpu_bcsr_get_nrow(starpu_data_handle_t handle)
 {
-	starpu_bcsr_interface_t *data_interface = (starpu_bcsr_interface_t *)
+	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return data_interface->nrow;
 }
 
-uint32_t starpu_bcsr_get_firstentry(starpu_data_handle handle)
+uint32_t starpu_bcsr_get_firstentry(starpu_data_handle_t handle)
 {
-	starpu_bcsr_interface_t *data_interface = (starpu_bcsr_interface_t *)
+	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return data_interface->firstentry;
 }
 
-uint32_t starpu_bcsr_get_r(starpu_data_handle handle)
+uint32_t starpu_bcsr_get_r(starpu_data_handle_t handle)
 {
-	starpu_bcsr_interface_t *data_interface = (starpu_bcsr_interface_t *)
+	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return data_interface->r;
 }
 
-uint32_t starpu_bcsr_get_c(starpu_data_handle handle)
+uint32_t starpu_bcsr_get_c(starpu_data_handle_t handle)
 {
-	starpu_bcsr_interface_t *data_interface = (starpu_bcsr_interface_t *)
+	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return data_interface->c;
 }
 
-size_t starpu_bcsr_get_elemsize(starpu_data_handle handle)
+size_t starpu_bcsr_get_elemsize(starpu_data_handle_t handle)
 {
-	starpu_bcsr_interface_t *data_interface = (starpu_bcsr_interface_t *)
+	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return data_interface->elemsize;
 }
 
-uintptr_t starpu_bcsr_get_local_nzval(starpu_data_handle handle)
+uintptr_t starpu_bcsr_get_local_nzval(starpu_data_handle_t handle)
 {
 	unsigned node;
 	node = _starpu_get_local_memory_node();
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_bcsr_interface_t *data_interface = (starpu_bcsr_interface_t *)
+	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 		starpu_data_get_interface_on_node(handle, node);
-	
+
 	return data_interface->nzval;
 }
 
-uint32_t *starpu_bcsr_get_local_colind(starpu_data_handle handle)
+uint32_t *starpu_bcsr_get_local_colind(starpu_data_handle_t handle)
 {
 	/* XXX 0 */
-	starpu_bcsr_interface_t *data_interface = (starpu_bcsr_interface_t *)
+	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return data_interface->colind;
 }
 
-uint32_t *starpu_bcsr_get_local_rowptr(starpu_data_handle handle)
+uint32_t *starpu_bcsr_get_local_rowptr(starpu_data_handle_t handle)
 {
 	/* XXX 0 */
-	starpu_bcsr_interface_t *data_interface = (starpu_bcsr_interface_t *)
+	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return data_interface->rowptr;
 }
 
 
-static size_t bcsr_interface_get_size(starpu_data_handle handle)
+static size_t bcsr_interface_get_size(starpu_data_handle_t handle)
 {
 	size_t size;
 
@@ -245,7 +250,7 @@ static size_t bcsr_interface_get_size(starpu_data_handle handle)
 	uint32_t c = starpu_bcsr_get_c(handle);
 	size_t elemsize = starpu_bcsr_get_elemsize(handle);
 
-	size = nnz*r*c*elemsize + nnz*sizeof(uint32_t) + (nrow+1)*sizeof(uint32_t); 
+	size = nnz*r*c*elemsize + nnz*sizeof(uint32_t) + (nrow+1)*sizeof(uint32_t);
 
 	return size;
 }
@@ -261,7 +266,7 @@ static ssize_t allocate_bcsr_buffer_on_node(void *data_interface_, uint32_t dst_
 	ssize_t allocated_memory;
 
 	/* we need the 3 arrays to be allocated */
-	starpu_bcsr_interface_t *bcsr_interface = (starpu_bcsr_interface_t *) data_interface_;
+	struct starpu_bcsr_interface *bcsr_interface = (struct starpu_bcsr_interface *) data_interface_;
 
 	uint32_t nnz = bcsr_interface->nnz;
 	uint32_t nrow = bcsr_interface->nrow;
@@ -270,9 +275,10 @@ static ssize_t allocate_bcsr_buffer_on_node(void *data_interface_, uint32_t dst_
 	uint32_t r = bcsr_interface->r;
 	uint32_t c = bcsr_interface->c;
 
-	starpu_node_kind kind = _starpu_get_node_kind(dst_node);
+	enum starpu_node_kind kind = starpu_node_get_kind(dst_node);
 
-	switch(kind) {
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
 			addr_nzval = (uintptr_t)malloc(nnz*r*c*elemsize);
 			if (!addr_nzval)
@@ -305,42 +311,43 @@ static ssize_t allocate_bcsr_buffer_on_node(void *data_interface_, uint32_t dst_
 #endif
 #ifdef STARPU_USE_OPENCL
 		case STARPU_OPENCL_RAM:
-                        {
-                                int ret;
-                                void *ptr;
+		{
+			int ret;
+			cl_mem ptr;
 
-                                ret = _starpu_opencl_allocate_memory(&ptr, nnz*r*c*elemsize, CL_MEM_READ_WRITE);
-                                addr_nzval = (uintptr_t)ptr;
-                                if (ret) goto fail_nzval;
+			ret = starpu_opencl_allocate_memory(&ptr, nnz*r*c*elemsize, CL_MEM_READ_WRITE);
+			addr_nzval = (uintptr_t)ptr;
+			if (ret) goto fail_nzval;
 
-                                ret = _starpu_opencl_allocate_memory(&ptr, nnz*sizeof(uint32_t), CL_MEM_READ_WRITE);
-                                addr_colind = ptr;
-				if (ret) goto fail_colind;
+			ret = starpu_opencl_allocate_memory(&ptr, nnz*sizeof(uint32_t), CL_MEM_READ_WRITE);
+			addr_colind = (void*) ptr;
+			if (ret) goto fail_colind;
 
-                                ret = _starpu_opencl_allocate_memory(&ptr, (nrow+1)*sizeof(uint32_t), CL_MEM_READ_WRITE);
-                                addr_rowptr = ptr;
-				if (ret) goto fail_rowptr;
+			ret = starpu_opencl_allocate_memory(&ptr, (nrow+1)*sizeof(uint32_t), CL_MEM_READ_WRITE);
+			addr_rowptr = (void*) ptr;
+			if (ret) goto fail_rowptr;
 
-                                break;
-                        }
+			break;
+		}
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 
 	/* allocation succeeded */
-	allocated_memory = 
+	allocated_memory =
 		nnz*r*c*elemsize + nnz*sizeof(uint32_t) + (nrow+1)*sizeof(uint32_t);
 
 	/* update the data properly in consequence */
 	bcsr_interface->nzval = addr_nzval;
 	bcsr_interface->colind = addr_colind;
 	bcsr_interface->rowptr = addr_rowptr;
-	
+
 	return allocated_memory;
 
 fail_rowptr:
-	switch(kind) {
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
 			free((void *)addr_colind);
 #ifdef STARPU_USE_CUDA
@@ -354,11 +361,12 @@ fail_rowptr:
 			break;
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 
 fail_colind:
-	switch(kind) {
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
 			free((void *)addr_nzval);
 #ifdef STARPU_USE_CUDA
@@ -372,7 +380,7 @@ fail_colind:
 			break;
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 
 fail_nzval:
@@ -383,10 +391,11 @@ fail_nzval:
 
 static void free_bcsr_buffer_on_node(void *data_interface, uint32_t node)
 {
-	starpu_bcsr_interface_t *bcsr_interface = (starpu_bcsr_interface_t *) data_interface;
+	struct starpu_bcsr_interface *bcsr_interface = (struct starpu_bcsr_interface *) data_interface;
 
-	starpu_node_kind kind = _starpu_get_node_kind(node);
-	switch(kind) {
+	enum starpu_node_kind kind = starpu_node_get_kind(node);
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
 			free((void*)bcsr_interface->nzval);
 			free((void*)bcsr_interface->colind);
@@ -407,15 +416,15 @@ static void free_bcsr_buffer_on_node(void *data_interface, uint32_t node)
 			break;
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 }
 
 #ifdef STARPU_USE_CUDA
 static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
 {
-	starpu_bcsr_interface_t *src_bcsr = src_interface;
-	starpu_bcsr_interface_t *dst_bcsr = dst_interface;
+	struct starpu_bcsr_interface *src_bcsr = src_interface;
+	struct starpu_bcsr_interface *dst_bcsr = dst_interface;
 
 	uint32_t nnz = src_bcsr->nnz;
 	uint32_t nrow = src_bcsr->nrow;
@@ -438,7 +447,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 
 	return 0;
 }
@@ -462,8 +471,8 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRI
 #ifdef STARPU_USE_OPENCL
 static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
-	starpu_bcsr_interface_t *src_bcsr = src_interface;
-	starpu_bcsr_interface_t *dst_bcsr = dst_interface;
+	struct starpu_bcsr_interface *src_bcsr = src_interface;
+	struct starpu_bcsr_interface *dst_bcsr = dst_interface;
 
 	uint32_t nnz = src_bcsr->nnz;
 	uint32_t nrow = src_bcsr->nrow;
@@ -474,27 +483,27 @@ static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTR
 
         int err;
 
-	err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->nzval, (void *)dst_bcsr->nzval, nnz*r*c*elemsize, 0, NULL);
+	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->nzval, src_node, (void *)dst_bcsr->nzval, dst_node, nnz*r*c*elemsize, 0, NULL);
 	if (STARPU_UNLIKELY(err))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->colind, (void *)dst_bcsr->colind, nnz*sizeof(uint32_t), 0, NULL);
+	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->colind, src_node, (void *)dst_bcsr->colind, dst_node, nnz*sizeof(uint32_t), 0, NULL);
 	if (STARPU_UNLIKELY(err))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->rowptr, (void *)dst_bcsr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
+	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->rowptr, src_node, (void *)dst_bcsr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), 0, NULL);
 	if (STARPU_UNLIKELY(err))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 
 	return 0;
 }
 
 static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
-	starpu_bcsr_interface_t *src_bcsr = src_interface;
-	starpu_bcsr_interface_t *dst_bcsr = dst_interface;
+	struct starpu_bcsr_interface *src_bcsr = src_interface;
+	struct starpu_bcsr_interface *dst_bcsr = dst_interface;
 
 	uint32_t nnz = src_bcsr->nnz;
 	uint32_t nrow = src_bcsr->nrow;
@@ -505,19 +514,19 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTR
 
         int err;
 
-	err = _starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->nzval, (cl_mem)dst_bcsr->nzval, nnz*r*c*elemsize, 0, NULL);
+	err = starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->nzval, src_node, (cl_mem)dst_bcsr->nzval, dst_node, nnz*r*c*elemsize, 0, NULL);
 	if (STARPU_UNLIKELY(err))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = _starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->colind, (cl_mem)dst_bcsr->colind, nnz*sizeof(uint32_t), 0, NULL);
+	err = starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->colind, src_node, (cl_mem)dst_bcsr->colind, dst_node, nnz*sizeof(uint32_t), 0, NULL);
 	if (STARPU_UNLIKELY(err))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = _starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->rowptr, (cl_mem)dst_bcsr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
+	err = starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->rowptr, src_node, (cl_mem)dst_bcsr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), 0, NULL);
 	if (STARPU_UNLIKELY(err))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 
 	return 0;
 }
@@ -526,8 +535,8 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTR
 /* as not all platforms easily have a BLAS lib installed ... */
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
-	starpu_bcsr_interface_t *src_bcsr = (starpu_bcsr_interface_t *) src_interface;
-	starpu_bcsr_interface_t *dst_bcsr = (starpu_bcsr_interface_t *) dst_interface;
+	struct starpu_bcsr_interface *src_bcsr = (struct starpu_bcsr_interface *) src_interface;
+	struct starpu_bcsr_interface *dst_bcsr = (struct starpu_bcsr_interface *) dst_interface;
 
 	uint32_t nnz = src_bcsr->nnz;
 	uint32_t nrow = src_bcsr->nrow;
@@ -542,7 +551,7 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBU
 
 	memcpy((void *)dst_bcsr->rowptr, (void *)src_bcsr->rowptr, (nrow+1)*sizeof(uint32_t));
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize*r*c + (nnz+nrow+1)*sizeof(uint32_t));
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize*r*c + (nnz+nrow+1)*sizeof(uint32_t));
 
 	return 0;
 }
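
Apart from the starpu_data_handle_t and struct starpu_bcsr_interface renames, the registration entry point keeps the signature shown above. A minimal usage sketch, with the CSR-style arrays assumed to be filled in by the application:

#include <starpu.h>

/* Register an application-owned BCSR matrix (2x2 blocks of floats, first
 * entry at index 0) on the main memory node. */
static starpu_data_handle_t register_bcsr(uint32_t nnz, uint32_t nrow,
					  float *nzval, uint32_t *colind,
					  uint32_t *rowptr)
{
	starpu_data_handle_t handle;
	starpu_bcsr_data_register(&handle, 0, nnz, nrow, (uintptr_t) nzval,
				  colind, rowptr, 0, 2, 2, sizeof(float));
	return handle;
}

The accessors renamed in this file (starpu_bcsr_get_nnz() and friends) then give back the registered geometry.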

+ 4 - 3
src/datawizard/interfaces/block_filters.c

@@ -21,8 +21,8 @@
 void starpu_block_filter_func_block(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f,
                                     unsigned id, unsigned nparts)
 {
-        starpu_block_interface_t *block_father = (starpu_block_interface_t *) father_interface;
-        starpu_block_interface_t *block_child = (starpu_block_interface_t *) child_interface;
+        struct starpu_block_interface *block_father = (struct starpu_block_interface *) father_interface;
+        struct starpu_block_interface *block_child = (struct starpu_block_interface *) child_interface;
 
 	uint32_t nx = block_father->nx;
         uint32_t ny = block_father->ny;
@@ -41,7 +41,8 @@ void starpu_block_filter_func_block(void *father_interface, void *child_interfac
 	block_child->nz = nz;
 	block_child->elemsize = elemsize;
 
-	if (block_father->ptr) {
+	if (block_father->ptr)
+	{
                 block_child->ptr = block_father->ptr + offset;
                 block_child->ldy = block_father->ldy;
                 block_child->ldz = block_father->ldz;
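
A block handle registered with starpu_block_data_register() can be split with this filter directly from the application; a minimal sketch (nslices is arbitrary):

#include <starpu.h>

/* Split a registered block handle into nslices sub-blocks with the filter above. */
static void split_block(starpu_data_handle_t block_handle, unsigned nslices)
{
	struct starpu_data_filter f =
	{
		.filter_func = starpu_block_filter_func_block,
		.nchildren = nslices,
	};

	starpu_data_partition(block_handle, &f);
	/* children are then reached with starpu_data_get_sub_data(block_handle, 1, i) */
}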

+ 146 - 112
src/datawizard/interfaces/block_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,7 +21,7 @@
 #include <datawizard/copy_driver.h>
 #include <datawizard/filters.h>
 
-#include <common/hash.h>
+#include <starpu_hash.h>
 
 #include <starpu_cuda.h>
 #include <starpu_opencl.h>
@@ -42,7 +42,8 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event);
 #endif
 
-static const struct starpu_data_copy_methods block_copy_data_methods_s = {
+static const struct starpu_data_copy_methods block_copy_data_methods_s =
+{
 	.ram_to_ram = copy_ram_to_ram,
 	.ram_to_spu = NULL,
 #ifdef STARPU_USE_CUDA
@@ -65,19 +66,20 @@ static const struct starpu_data_copy_methods block_copy_data_methods_s = {
 };
 
 
-static void register_block_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
-static void *block_handle_to_pointer(starpu_data_handle data_handle, uint32_t node);
+static void register_block_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface);
+static void *block_handle_to_pointer(starpu_data_handle_t data_handle, uint32_t node);
 static ssize_t allocate_block_buffer_on_node(void *data_interface_, uint32_t dst_node);
 static void free_block_buffer_on_node(void *data_interface, uint32_t node);
-static size_t block_interface_get_size(starpu_data_handle handle);
-static uint32_t footprint_block_interface_crc32(starpu_data_handle handle);
+static size_t block_interface_get_size(starpu_data_handle_t handle);
+static uint32_t footprint_block_interface_crc32(starpu_data_handle_t handle);
 static int block_compare(void *data_interface_a, void *data_interface_b);
-static void display_block_interface(starpu_data_handle handle, FILE *f);
+static void display_block_interface(starpu_data_handle_t handle, FILE *f);
 #ifdef STARPU_USE_GORDON
 static int convert_block_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss);
 #endif
 
-static struct starpu_data_interface_ops_t interface_block_ops = {
+static struct starpu_data_interface_ops interface_block_ops =
+{
 	.register_data_handle = register_block_handle,
 	.allocate_data_on_node = allocate_block_buffer_on_node,
 	.handle_to_pointer = block_handle_to_pointer,
@@ -89,13 +91,13 @@ static struct starpu_data_interface_ops_t interface_block_ops = {
 #ifdef STARPU_USE_GORDON
 	.convert_to_gordon = convert_block_to_gordon,
 #endif
-	.interfaceid = STARPU_BLOCK_INTERFACE_ID, 
-	.interface_size = sizeof(starpu_block_interface_t),
-	.display = display_block_interface
+	.interfaceid = STARPU_BLOCK_INTERFACE_ID,
+	.interface_size = sizeof(struct starpu_block_interface),
+	.display = display_block_interface,
 };
 
 #ifdef STARPU_USE_GORDON
-int convert_block_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss) 
+int convert_block_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss)
 {
 	/* TODO */
 	STARPU_ABORT();
@@ -104,34 +106,36 @@ int convert_block_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSi
 }
 #endif
 
-static void *block_handle_to_pointer(starpu_data_handle handle, uint32_t node)
+static void *block_handle_to_pointer(starpu_data_handle_t handle, uint32_t node)
 {
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_block_interface_t *block_interface = (starpu_block_interface_t *)
+	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
 		starpu_data_get_interface_on_node(handle, node);
 
 	return (void*) block_interface->ptr;
 }
 
-static void register_block_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface)
+static void register_block_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface)
 {
-	starpu_block_interface_t *block_interface = (starpu_block_interface_t *) data_interface;
+	struct starpu_block_interface *block_interface = (struct starpu_block_interface *) data_interface;
 
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		starpu_block_interface_t *local_interface = (starpu_block_interface_t *)
+		struct starpu_block_interface *local_interface = (struct starpu_block_interface *)
 			starpu_data_get_interface_on_node(handle, node);
 
-		if (node == home_node) {
+		if (node == home_node)
+		{
 			local_interface->ptr = block_interface->ptr;
                         local_interface->dev_handle = block_interface->dev_handle;
                         local_interface->offset = block_interface->offset;
 			local_interface->ldy  = block_interface->ldy;
 			local_interface->ldz  = block_interface->ldz;
 		}
-		else {
+		else
+		{
 			local_interface->ptr = 0;
                         local_interface->dev_handle = 0;
                         local_interface->offset = 0;
@@ -147,11 +151,12 @@ static void register_block_handle(starpu_data_handle handle, uint32_t home_node,
 }
 
 /* declare a new piece of data with the block interface */
-void starpu_block_data_register(starpu_data_handle *handleptr, uint32_t home_node,
+void starpu_block_data_register(starpu_data_handle_t *handleptr, uint32_t home_node,
 			uintptr_t ptr, uint32_t ldy, uint32_t ldz, uint32_t nx,
 			uint32_t ny, uint32_t nz, size_t elemsize)
 {
-	starpu_block_interface_t block_interface = {
+	struct starpu_block_interface block_interface =
+	{
 		.ptr = ptr,
                 .dev_handle = ptr,
                 .offset = 0,
@@ -166,21 +171,21 @@ void starpu_block_data_register(starpu_data_handle *handleptr, uint32_t home_nod
 	starpu_data_register(handleptr, home_node, &block_interface, &interface_block_ops);
 }
 
-static uint32_t footprint_block_interface_crc32(starpu_data_handle handle)
+static uint32_t footprint_block_interface_crc32(starpu_data_handle_t handle)
 {
 	uint32_t hash;
 
-	hash = _starpu_crc32_be(starpu_block_get_nx(handle), 0);
-	hash = _starpu_crc32_be(starpu_block_get_ny(handle), hash);
-	hash = _starpu_crc32_be(starpu_block_get_nz(handle), hash);
+	hash = starpu_crc32_be(starpu_block_get_nx(handle), 0);
+	hash = starpu_crc32_be(starpu_block_get_ny(handle), hash);
+	hash = starpu_crc32_be(starpu_block_get_nz(handle), hash);
 
 	return hash;
 }
 
 static int block_compare(void *data_interface_a, void *data_interface_b)
 {
-	starpu_block_interface_t *block_a = (starpu_block_interface_t *) data_interface_a;
-	starpu_block_interface_t *block_b = (starpu_block_interface_t *) data_interface_b;
+	struct starpu_block_interface *block_a = (struct starpu_block_interface *) data_interface_a;
+	struct starpu_block_interface *block_b = (struct starpu_block_interface *) data_interface_b;
 
 	/* Two matrices are considered compatible if they have the same size */
 	return ((block_a->nx == block_b->nx)
@@ -189,94 +194,94 @@ static int block_compare(void *data_interface_a, void *data_interface_b)
 			&& (block_a->elemsize == block_b->elemsize));
 }
 
-static void display_block_interface(starpu_data_handle handle, FILE *f)
+static void display_block_interface(starpu_data_handle_t handle, FILE *f)
 {
-	starpu_block_interface_t *block_interface;
+	struct starpu_block_interface *block_interface;
 
-	block_interface = (starpu_block_interface_t *) starpu_data_get_interface_on_node(handle, 0);
+	block_interface = (struct starpu_block_interface *) starpu_data_get_interface_on_node(handle, 0);
 
 	fprintf(f, "%u\t%u\t%u\t", block_interface->nx, block_interface->ny, block_interface->nz);
 }
 
-static size_t block_interface_get_size(starpu_data_handle handle)
+static size_t block_interface_get_size(starpu_data_handle_t handle)
 {
 	size_t size;
-	starpu_block_interface_t *block_interface;
+	struct starpu_block_interface *block_interface;
 
-	block_interface = (starpu_block_interface_t *) starpu_data_get_interface_on_node(handle, 0);
+	block_interface = (struct starpu_block_interface *) starpu_data_get_interface_on_node(handle, 0);
 
-	size = block_interface->nx*block_interface->ny*block_interface->nz*block_interface->elemsize; 
+	size = block_interface->nx*block_interface->ny*block_interface->nz*block_interface->elemsize;
 
 	return size;
 }
 
 /* offer access to the data parameters */
-uint32_t starpu_block_get_nx(starpu_data_handle handle)
+uint32_t starpu_block_get_nx(starpu_data_handle_t handle)
 {
-	starpu_block_interface_t *block_interface = (starpu_block_interface_t *)
+	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return block_interface->nx;
 }
 
-uint32_t starpu_block_get_ny(starpu_data_handle handle)
+uint32_t starpu_block_get_ny(starpu_data_handle_t handle)
 {
-	starpu_block_interface_t *block_interface = (starpu_block_interface_t *)
+	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return block_interface->ny;
 }
 
-uint32_t starpu_block_get_nz(starpu_data_handle handle)
+uint32_t starpu_block_get_nz(starpu_data_handle_t handle)
 {
-	starpu_block_interface_t *block_interface = (starpu_block_interface_t *)
+	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return block_interface->nz;
 }
 
-uint32_t starpu_block_get_local_ldy(starpu_data_handle handle)
+uint32_t starpu_block_get_local_ldy(starpu_data_handle_t handle)
 {
 	unsigned node;
 	node = _starpu_get_local_memory_node();
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
-	
-	starpu_block_interface_t *block_interface = (starpu_block_interface_t *)
+
+	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
 		starpu_data_get_interface_on_node(handle, node);
 
 	return block_interface->ldy;
 }
 
-uint32_t starpu_block_get_local_ldz(starpu_data_handle handle)
+uint32_t starpu_block_get_local_ldz(starpu_data_handle_t handle)
 {
 	unsigned node;
 	node = _starpu_get_local_memory_node();
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_block_interface_t *block_interface = (starpu_block_interface_t *)
+	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
 		starpu_data_get_interface_on_node(handle, node);
 
 	return block_interface->ldz;
 }
 
-uintptr_t starpu_block_get_local_ptr(starpu_data_handle handle)
+uintptr_t starpu_block_get_local_ptr(starpu_data_handle_t handle)
 {
 	unsigned node;
 	node = _starpu_get_local_memory_node();
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_block_interface_t *block_interface = (starpu_block_interface_t *)
+	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
 		starpu_data_get_interface_on_node(handle, node);
 
 	return block_interface->ptr;
 }
 
-size_t starpu_block_get_elemsize(starpu_data_handle handle)
+size_t starpu_block_get_elemsize(starpu_data_handle_t handle)
 {
-	starpu_block_interface_t *block_interface = (starpu_block_interface_t *)
+	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return block_interface->elemsize;
@@ -288,26 +293,27 @@ size_t starpu_block_get_elemsize(starpu_data_handle handle)
 /* returns the size of the allocated area */
 static ssize_t allocate_block_buffer_on_node(void *data_interface_, uint32_t dst_node)
 {
-	uintptr_t addr = 0;
+	uintptr_t addr = 0, handle = 0;
 	unsigned fail = 0;
 	ssize_t allocated_memory;
 
 #ifdef STARPU_USE_CUDA
 	cudaError_t status;
 #endif
-	starpu_block_interface_t *dst_block = (starpu_block_interface_t *) data_interface_;
+	struct starpu_block_interface *dst_block = (struct starpu_block_interface *) data_interface_;
 
 	uint32_t nx = dst_block->nx;
 	uint32_t ny = dst_block->ny;
 	uint32_t nz = dst_block->nz;
 	size_t elemsize = dst_block->elemsize;
 
-	starpu_node_kind kind = _starpu_get_node_kind(dst_node);
+	enum starpu_node_kind kind = starpu_node_get_kind(dst_node);
 
-	switch(kind) {
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
-			addr = (uintptr_t)malloc(nx*ny*nz*elemsize);
-			if (!addr) 
+			handle = addr = (uintptr_t)malloc(nx*ny*nz*elemsize);
+			if (!addr)
 				fail = 1;
 
 			break;
@@ -324,6 +330,7 @@ static ssize_t allocate_block_buffer_on_node(void *data_interface_, uint32_t dst
 
 				fail = 1;
 			}
+			handle = addr;
 
 			break;
 #endif
@@ -331,47 +338,52 @@ static ssize_t allocate_block_buffer_on_node(void *data_interface_, uint32_t dst
 	        case STARPU_OPENCL_RAM:
 			{
                                 int ret;
-                                void *ptr;
-                                ret = _starpu_opencl_allocate_memory(&ptr, nx*ny*nz*elemsize, CL_MEM_READ_WRITE);
-                                addr = (uintptr_t)ptr;
-				if (ret) {
+				cl_mem mem;
+                                ret = starpu_opencl_allocate_memory(&mem, nx*ny*nz*elemsize, CL_MEM_READ_WRITE);
+				handle = (uintptr_t)mem;
+				if (ret)
+				{
 					fail = 1;
 				}
 				break;
 			}
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 
-	if (!fail) {
+	if (!fail)
+	{
 		/* allocation succeeded */
 		allocated_memory = nx*ny*nz*elemsize;
 
 		/* update the data properly in consequence */
 		dst_block->ptr = addr;
-                dst_block->dev_handle = addr;
+		dst_block->dev_handle = handle;
                 dst_block->offset = 0;
 		dst_block->ldy = nx;
 		dst_block->ldz = nx*ny;
-	} else {
+	}
+	else
+	{
 		/* allocation failed */
 		allocated_memory = -ENOMEM;
 	}
-	
+
 	return allocated_memory;
 }
 
 static void free_block_buffer_on_node(void *data_interface, uint32_t node)
 {
-	starpu_block_interface_t *block_interface = (starpu_block_interface_t *) data_interface;
+	struct starpu_block_interface *block_interface = (struct starpu_block_interface *) data_interface;
 
 #ifdef STARPU_USE_CUDA
 	cudaError_t status;
 #endif
 
-	starpu_node_kind kind = _starpu_get_node_kind(node);
-	switch(kind) {
+	enum starpu_node_kind kind = starpu_node_get_kind(node);
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
 			free((void*)block_interface->ptr);
 			break;
@@ -385,19 +397,19 @@ static void free_block_buffer_on_node(void *data_interface, uint32_t node)
 #endif
 #ifdef STARPU_USE_OPENCL
                 case STARPU_OPENCL_RAM:
-                        clReleaseMemObject((void *)block_interface->ptr);
+			clReleaseMemObject((void *)block_interface->dev_handle);
                         break;
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 }
 
 #ifdef STARPU_USE_CUDA
 static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
 {
-	starpu_block_interface_t *src_block = src_interface;
-	starpu_block_interface_t *dst_block = dst_interface;
+	struct starpu_block_interface *src_block = src_interface;
+	struct starpu_block_interface *dst_block = dst_interface;
 
 	uint32_t nx = src_block->nx;
 	uint32_t ny = src_block->ny;
@@ -416,7 +428,8 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
                         if (STARPU_UNLIKELY(cures))
                                 STARPU_CUDA_REPORT_ERROR(cures);
                 }
-		else {
+		else
+		{
 			/* Are all planes contiguous? */
                         cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
                                              (char *)src_block->ptr, src_block->ldz*elemsize,
@@ -425,7 +438,8 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
                                 STARPU_CUDA_REPORT_ERROR(cures);
                 }
 	}
-	else {
+	else
+	{
 		/* Default case: we transfer all lines one by one: ny*nz transfers */
 		unsigned layer;
 		for (layer = 0; layer < src_block->nz; layer++)
@@ -442,15 +456,15 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 		}
 	}
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->elemsize*src_block->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->elemsize*src_block->elemsize);
 
 	return 0;
 }
 
 static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream, enum cudaMemcpyKind kind)
 {
-	starpu_block_interface_t *src_block = src_interface;
-	starpu_block_interface_t *dst_block = dst_interface;
+	struct starpu_block_interface *src_block = src_interface;
+	struct starpu_block_interface *dst_block = dst_interface;
 
 	uint32_t nx = src_block->nx;
 	uint32_t ny = src_block->ny;
@@ -468,8 +482,10 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 		/* Is that a single contiguous buffer ? */
 		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
 		{
+			_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 			cures = cudaMemcpyAsync((char *)dst_block->ptr, (char *)src_block->ptr,
 					nx*ny*nz*elemsize, kind, stream);
+			_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 			if (STARPU_UNLIKELY(cures))
 			{
 				cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
@@ -479,16 +495,20 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 
 				ret = 0;
 			}
-			else {
+			else
+			{
 				ret = -EAGAIN;
 			}
-			
+
 		}
-		else {
+		else
+		{
 			/* Are all planes contiguous? */
+			_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 			cures = cudaMemcpy2DAsync((char *)dst_block->ptr, dst_block->ldz*elemsize,
 					(char *)src_block->ptr, src_block->ldz*elemsize,
 					nx*ny*elemsize, nz, kind, stream);
+			_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 			if (STARPU_UNLIKELY(cures))
 			{
 				cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
@@ -499,12 +519,14 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 
 				ret = 0;
 			}
-			else {
+			else
+			{
 				ret = -EAGAIN;
 			}
 		}
 	}
-	else {
+	else
+	{
 		/* Default case: we transfer all lines one by one: ny*nz transfers */
 		unsigned layer;
 		for (layer = 0; layer < src_block->nz; layer++)
@@ -512,9 +534,11 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
 			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
 
+			_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 			cures = cudaMemcpy2DAsync((char *)dst_ptr, dst_block->ldy*elemsize,
                                                   (char *)src_ptr, src_block->ldy*elemsize,
                                                   nx*elemsize, ny, kind, stream);
+			_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 
 			if (STARPU_UNLIKELY(cures))
 			{
@@ -528,7 +552,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 
 	}
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
 
 	return ret;
 
@@ -549,7 +573,7 @@ no_async_default:
 			STARPU_CUDA_REPORT_ERROR(cures);
 	}
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
 	return 0;
 	}
 }
@@ -583,9 +607,9 @@ static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_
 #ifdef STARPU_USE_OPENCL
 static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
 {
-	starpu_block_interface_t *src_block = src_interface;
-	starpu_block_interface_t *dst_block = dst_interface;
-        int err,ret;
+	struct starpu_block_interface *src_block = src_interface;
+	struct starpu_block_interface *dst_block = dst_interface;
+        int err, ret = 0;
 
 	uint32_t nx = src_block->nx;
 	uint32_t ny = src_block->ny;
@@ -597,27 +621,30 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 		/* Is that a single contiguous buffer ? */
 		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
 		{
-                        err = _starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_block->ptr, (cl_mem)dst_block->dev_handle,
+                        err = starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_block->ptr, src_node, (cl_mem)dst_block->dev_handle, dst_node,
                                                                            src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
                                                                            dst_block->offset, (cl_event*)_event, &ret);
                         if (STARPU_UNLIKELY(err))
                                 STARPU_OPENCL_REPORT_ERROR(err);
                 }
-		else {
+		else
+		{
 			/* Are all planes contiguous? */
                         /* XXX non contiguous buffers are not properly supported yet. (TODO) */
                         STARPU_ASSERT(0);
                 }
         }
-	else {
+	else
+	{
 		/* Default case: we transfer all lines one by one: ny*nz transfers */
 		unsigned layer;
 		for (layer = 0; layer < src_block->nz; layer++)
 		{
                         unsigned j;
-                        for(j=0 ; j<src_block->ny ; j++) {
+                        for(j=0 ; j<src_block->ny ; j++)
+			{
                                 void *ptr = (void*)src_block->ptr+(layer*src_block->ldz*src_block->elemsize)+(j*src_block->ldy*src_block->elemsize);
-                                err = _starpu_opencl_copy_ram_to_opencl(ptr, (cl_mem)dst_block->dev_handle,
+                                err = starpu_opencl_copy_ram_to_opencl(ptr, src_node, (cl_mem)dst_block->dev_handle, dst_node,
                                                                         src_block->nx*src_block->elemsize,
                                                                         layer*dst_block->ldz*dst_block->elemsize + j*dst_block->ldy*dst_block->elemsize
                                                                         + dst_block->offset, NULL);
@@ -636,23 +663,23 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
                         //                        size_t host_row_pitch=region[0];
                         //                        size_t host_slice_pitch=region[1] * host_row_pitch;
                         //
-                        //                        _starpu_opencl_copy_rect_ram_to_opencl((void *)src_block->ptr, (cl_mem)dst_block->dev_handle,
+                        //                        _starpu_opencl_copy_rect_ram_to_opencl((void *)src_block->ptr, src_node, (cl_mem)dst_block->dev_handle, dst_node,
                         //                                                               buffer_origin, host_origin, region,
                         //                                                               buffer_row_pitch, buffer_slice_pitch,
                         //                                                               host_row_pitch, host_slice_pitch, NULL);
                 }
         }
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
 
 	return ret;
 }
 
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
 {
-	starpu_block_interface_t *src_block = src_interface;
-	starpu_block_interface_t *dst_block = dst_interface;
-        int err, ret;
+	struct starpu_block_interface *src_block = src_interface;
+	struct starpu_block_interface *dst_block = dst_interface;
+        int err, ret = 0;
 
 	/* We may have a contiguous buffer for the entire block, or contiguous
 	 * planes within the block, so we can avoid many small transfers that way */
@@ -661,31 +688,36 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
 		/* Is that a single contiguous buffer ? */
 		if (((src_block->nx*src_block->ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
 		{
-                        err = _starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_block->dev_handle, (void*)dst_block->ptr,
+                        err = starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_block->dev_handle, src_node, (void*)dst_block->ptr, dst_node,
                                                                            src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
                                                                            src_block->offset, (cl_event*)_event, &ret);
                         if (STARPU_UNLIKELY(err))
                                 STARPU_OPENCL_REPORT_ERROR(err);
                 }
-                else {
+                else
+		{
 			/* Are all planes contiguous? */
                         /* XXX non contiguous buffers are not properly supported yet. (TODO) */
                         STARPU_ASSERT(0);
                 }
         }
-	else {
+	else
+	{
 		/* Default case: we transfer all lines one by one: ny*nz transfers */
                 /* XXX non contiguous buffers are not properly supported yet. (TODO) */
 		unsigned layer;
 		for (layer = 0; layer < src_block->nz; layer++)
 		{
                         unsigned j;
-                        for(j=0 ; j<src_block->ny ; j++) {
+                        for(j=0 ; j<src_block->ny ; j++)
+			{
                                 void *ptr = (void *)dst_block->ptr+(layer*dst_block->ldz*dst_block->elemsize)+(j*dst_block->ldy*dst_block->elemsize);
-                                err = _starpu_opencl_copy_opencl_to_ram((void*)src_block->dev_handle, ptr,
+                                err = starpu_opencl_copy_opencl_to_ram((void*)src_block->dev_handle, src_node, ptr, dst_node,
                                                                         src_block->nx*src_block->elemsize,
                                                                         layer*src_block->ldz*src_block->elemsize+j*src_block->ldy*src_block->elemsize+
                                                                         src_block->offset, NULL);
+				if (STARPU_UNLIKELY(err))
+					STARPU_OPENCL_REPORT_ERROR(err);
                         }
                         //                        const size_t buffer_origin[3] = {src_block->offset, 0, 0};
                         //                        const size_t host_origin[3] = {layer*src_block->ldz*src_block->elemsize, 0, 0};
@@ -695,14 +727,14 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
                         //                        size_t host_row_pitch=region[0];
                         //                        size_t host_slice_pitch=region[1] * host_row_pitch;
                         //
-                        //                        _starpu_opencl_copy_rect_opencl_to_ram((cl_mem)src_block->dev_handle, (void *)dst_block->ptr,
+                        //                        _starpu_opencl_copy_rect_opencl_to_ram((cl_mem)src_block->dev_handle, src_node, (void *)dst_block->ptr, dst_node,
                         //                                                               buffer_origin, host_origin, region,
                         //                                                               buffer_row_pitch, buffer_slice_pitch,
                         //                                                               host_row_pitch, host_slice_pitch, NULL);
                 }
         }
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
 
 	return ret;
 }
@@ -722,8 +754,8 @@ static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTR
 /* as not all platforms easily have a BLAS lib installed ... */
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
-	starpu_block_interface_t *src_block = (starpu_block_interface_t *) src_interface;
-	starpu_block_interface_t *dst_block = (starpu_block_interface_t *) dst_interface;
+	struct starpu_block_interface *src_block = (struct starpu_block_interface *) src_interface;
+	struct starpu_block_interface *dst_block = (struct starpu_block_interface *) dst_interface;
 
 	uint32_t nx = dst_block->nx;
 	uint32_t ny = dst_block->ny;
@@ -740,16 +772,18 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBU
 
 	unsigned y, z;
 	for (z = 0; z < nz; z++)
-	for (y = 0; y < ny; y++)
 	{
-		uint32_t src_offset = (y*ldy_src + y*z*ldz_src)*elemsize;
-		uint32_t dst_offset = (y*ldy_dst + y*z*ldz_dst)*elemsize;
+		for (y = 0; y < ny; y++)
+		{
+			uint32_t src_offset = (y*ldy_src + z*ldz_src)*elemsize;
+			uint32_t dst_offset = (y*ldy_dst + z*ldz_dst)*elemsize;
 
-		memcpy((void *)(ptr_dst + dst_offset), 
-			(void *)(ptr_src + src_offset), nx*elemsize);
+			memcpy((void *)(ptr_dst + dst_offset),
+				(void *)(ptr_src + src_offset), nx*elemsize);
+		}
 	}
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, nx*ny*nz*elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nx*ny*nz*elemsize);
 
 	return 0;
 }
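
For reference, registering a block with the renamed type and reading its geometry back through the accessors updated above; the layout here is assumed fully contiguous (ldy = nx, ldz = nx*ny):

#include <starpu.h>
#include <stdlib.h>

/* Register a contiguous nx x ny x nz block of floats on the main memory node. */
static starpu_data_handle_t register_block(uint32_t nx, uint32_t ny, uint32_t nz)
{
	float *block = calloc((size_t) nx * ny * nz, sizeof(float));
	starpu_data_handle_t handle;

	starpu_block_data_register(&handle, 0, (uintptr_t) block,
				   nx /* ldy */, nx * ny /* ldz */,
				   nx, ny, nz, sizeof(float));

	/* the accessors renamed above return the registered geometry */
	STARPU_ASSERT(starpu_block_get_nx(handle) == nx);

	return handle;
}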

+ 13 - 12
src/datawizard/interfaces/csr_filters.c

@@ -22,8 +22,8 @@
 
 void starpu_vertical_block_filter_func_csr(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, unsigned nchunks)
 {
-	starpu_csr_interface_t *csr_father = (starpu_csr_interface_t *) father_interface;
-	starpu_csr_interface_t *csr_child = (starpu_csr_interface_t *) child_interface;
+	struct starpu_csr_interface *csr_father = (struct starpu_csr_interface *) father_interface;
+	struct starpu_csr_interface *csr_child = (struct starpu_csr_interface *) child_interface;
 
 	uint32_t nrow = csr_father->nrow;
 	size_t elemsize = csr_father->elemsize;
@@ -35,20 +35,21 @@ void starpu_vertical_block_filter_func_csr(void *father_interface, void *child_i
 
 	uint32_t first_index = id*chunk_size - firstentry;
 	uint32_t local_firstentry = rowptr[first_index];
-	
-	uint32_t child_nrow = 
+
+	uint32_t child_nrow =
 	  STARPU_MIN(chunk_size, nrow - id*chunk_size);
-	
-	uint32_t local_nnz = rowptr[first_index + child_nrow] - rowptr[first_index]; 
-	
+
+	uint32_t local_nnz = rowptr[first_index + child_nrow] - rowptr[first_index];
+
 	csr_child->nnz = local_nnz;
 	csr_child->nrow = child_nrow;
 	csr_child->firstentry = local_firstentry;
 	csr_child->elemsize = elemsize;
-	
-	if (csr_father->nzval) {
-	  csr_child->rowptr = &csr_father->rowptr[first_index];
-	  csr_child->colind = &csr_father->colind[local_firstentry];
-	  csr_child->nzval = csr_father->nzval + local_firstentry * elemsize;
+
+	if (csr_father->nzval)
+	{
+		csr_child->rowptr = &csr_father->rowptr[first_index];
+		csr_child->colind = &csr_father->colind[local_firstentry];
+		csr_child->nzval = csr_father->nzval + local_firstentry * elemsize;
 	}
 }
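
Unlike the BCSR canonical-block filter, the children produced here are CSR matrices themselves, so (per the get_child_ops fallback shown earlier) no get_child_ops callback is needed. A minimal sketch, assuming csr_handle was registered with starpu_csr_data_register():

#include <starpu.h>

/* Split a CSR matrix row-wise into `parts` CSR sub-matrices. */
static void split_csr(starpu_data_handle_t csr_handle, unsigned parts)
{
	struct starpu_data_filter f =
	{
		.filter_func = starpu_vertical_block_filter_func_csr,
		.nchildren = parts,
	};

	starpu_data_partition(csr_handle, &f);
	/* each starpu_data_get_sub_data(csr_handle, 1, i) is again a CSR handle */
}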

+ 136 - 96
src/datawizard/interfaces/csr_interface.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -22,7 +22,7 @@
 #include <datawizard/copy_driver.h>
 #include <datawizard/filters.h>
 
-#include <common/hash.h>
+#include <starpu_hash.h>
 
 #include <starpu_cuda.h>
 #include <starpu_opencl.h>
@@ -42,7 +42,8 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_
 static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 #endif
 
-static const struct starpu_data_copy_methods csr_copy_data_methods_s = {
+static const struct starpu_data_copy_methods csr_copy_data_methods_s =
+{
 	.ram_to_ram = copy_ram_to_ram,
 	.ram_to_spu = NULL,
 #ifdef STARPU_USE_CUDA
@@ -63,40 +64,43 @@ static const struct starpu_data_copy_methods csr_copy_data_methods_s = {
 	.spu_to_spu = NULL
 };
 
-static void register_csr_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
+static void register_csr_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface);
 static ssize_t allocate_csr_buffer_on_node(void *data_interface_, uint32_t dst_node);
 static void free_csr_buffer_on_node(void *data_interface, uint32_t node);
-static size_t csr_interface_get_size(starpu_data_handle handle);
+static size_t csr_interface_get_size(starpu_data_handle_t handle);
 static int csr_compare(void *data_interface_a, void *data_interface_b);
-static uint32_t footprint_csr_interface_crc32(starpu_data_handle handle);
+static uint32_t footprint_csr_interface_crc32(starpu_data_handle_t handle);
 
-static struct starpu_data_interface_ops_t interface_csr_ops = {
+static struct starpu_data_interface_ops interface_csr_ops =
+{
 	.register_data_handle = register_csr_handle,
 	.allocate_data_on_node = allocate_csr_buffer_on_node,
 	.free_data_on_node = free_csr_buffer_on_node,
 	.copy_methods = &csr_copy_data_methods_s,
 	.get_size = csr_interface_get_size,
 	.interfaceid = STARPU_CSR_INTERFACE_ID,
-	.interface_size = sizeof(starpu_csr_interface_t),
+	.interface_size = sizeof(struct starpu_csr_interface),
 	.footprint = footprint_csr_interface_crc32,
-	.compare = csr_compare
+	.compare = csr_compare,
 };
 
-static void register_csr_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface)
+static void register_csr_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface)
 {
-	starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *) data_interface;
+	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *) data_interface;
 
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		starpu_csr_interface_t *local_interface = (starpu_csr_interface_t *)
+		struct starpu_csr_interface *local_interface = (struct starpu_csr_interface *)
 			starpu_data_get_interface_on_node(handle, node);
 
-		if (node == home_node) {
+		if (node == home_node)
+		{
 			local_interface->nzval = csr_interface->nzval;
 			local_interface->colind = csr_interface->colind;
 		}
-		else {
+		else
+		{
 			local_interface->nzval = 0;
 			local_interface->colind = NULL;
 		}
@@ -111,10 +115,11 @@ static void register_csr_handle(starpu_data_handle handle, uint32_t home_node, v
 }
 
 /* declare a new data with the BLAS interface */
-void starpu_csr_data_register(starpu_data_handle *handleptr, uint32_t home_node,
+void starpu_csr_data_register(starpu_data_handle_t *handleptr, uint32_t home_node,
 		uint32_t nnz, uint32_t nrow, uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, size_t elemsize)
 {
-	starpu_csr_interface_t csr_interface = {
+	struct starpu_csr_interface csr_interface =
+	{
 		.nnz = nnz,
 		.nrow = nrow,
 		.nzval = nzval,
@@ -127,15 +132,15 @@ void starpu_csr_data_register(starpu_data_handle *handleptr, uint32_t home_node,
 	starpu_data_register(handleptr, home_node, &csr_interface, &interface_csr_ops);
 }
 
-static uint32_t footprint_csr_interface_crc32(starpu_data_handle handle)
+static uint32_t footprint_csr_interface_crc32(starpu_data_handle_t handle)
 {
-	return _starpu_crc32_be(starpu_csr_get_nnz(handle), 0);
+	return starpu_crc32_be(starpu_csr_get_nnz(handle), 0);
 }
 
 static int csr_compare(void *data_interface_a, void *data_interface_b)
 {
-	starpu_csr_interface_t *csr_a = (starpu_csr_interface_t *) data_interface_a;
-	starpu_csr_interface_t *csr_b = (starpu_csr_interface_t *) data_interface_b;
+	struct starpu_csr_interface *csr_a = (struct starpu_csr_interface *) data_interface_a;
+	struct starpu_csr_interface *csr_b = (struct starpu_csr_interface *) data_interface_b;
 
 	/* Two matricess are considered compatible if they have the same size */
 	return ((csr_a->nnz == csr_b->nnz)
@@ -144,78 +149,78 @@ static int csr_compare(void *data_interface_a, void *data_interface_b)
 }
 
 /* offer an access to the data parameters */
-uint32_t starpu_csr_get_nnz(starpu_data_handle handle)
+uint32_t starpu_csr_get_nnz(starpu_data_handle_t handle)
 {
-	starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *)
+	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return csr_interface->nnz;
 }
 
-uint32_t starpu_csr_get_nrow(starpu_data_handle handle)
+uint32_t starpu_csr_get_nrow(starpu_data_handle_t handle)
 {
-	starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *)
+	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return csr_interface->nrow;
 }
 
-uint32_t starpu_csr_get_firstentry(starpu_data_handle handle)
+uint32_t starpu_csr_get_firstentry(starpu_data_handle_t handle)
 {
-	starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *)
+	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return csr_interface->firstentry;
 }
 
-size_t starpu_csr_get_elemsize(starpu_data_handle handle)
+size_t starpu_csr_get_elemsize(starpu_data_handle_t handle)
 {
-	starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *)
+	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return csr_interface->elemsize;
 }
 
-uintptr_t starpu_csr_get_local_nzval(starpu_data_handle handle)
+uintptr_t starpu_csr_get_local_nzval(starpu_data_handle_t handle)
 {
 	unsigned node;
 	node = _starpu_get_local_memory_node();
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *)
+	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *)
 		starpu_data_get_interface_on_node(handle, node);
 
 	return csr_interface->nzval;
 }
 
-uint32_t *starpu_csr_get_local_colind(starpu_data_handle handle)
+uint32_t *starpu_csr_get_local_colind(starpu_data_handle_t handle)
 {
 	unsigned node;
 	node = _starpu_get_local_memory_node();
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *)
+	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *)
 		starpu_data_get_interface_on_node(handle, node);
 
 	return csr_interface->colind;
 }
 
-uint32_t *starpu_csr_get_local_rowptr(starpu_data_handle handle)
+uint32_t *starpu_csr_get_local_rowptr(starpu_data_handle_t handle)
 {
 	unsigned node;
 	node = _starpu_get_local_memory_node();
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *)
+	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *)
 		starpu_data_get_interface_on_node(handle, node);
 
 	return csr_interface->rowptr;
 }
 
-static size_t csr_interface_get_size(starpu_data_handle handle)
+static size_t csr_interface_get_size(starpu_data_handle_t handle)
 {
 	size_t size;
 
@@ -238,15 +243,16 @@ static ssize_t allocate_csr_buffer_on_node(void *data_interface_, uint32_t dst_n
 	ssize_t allocated_memory;
 
 	/* we need the 3 arrays to be allocated */
-	starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *) data_interface_;
+	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *) data_interface_;
 
 	uint32_t nnz = csr_interface->nnz;
 	uint32_t nrow = csr_interface->nrow;
 	size_t elemsize = csr_interface->elemsize;
 
-	starpu_node_kind kind = _starpu_get_node_kind(dst_node);
+	enum starpu_node_kind kind = starpu_node_get_kind(dst_node);
 
-	switch(kind) {
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
 			addr_nzval = (uintptr_t)malloc(nnz*elemsize);
 			if (!addr_nzval)
@@ -281,46 +287,52 @@ static ssize_t allocate_csr_buffer_on_node(void *data_interface_, uint32_t dst_n
 	        case STARPU_OPENCL_RAM:
 			{
                                 int ret;
-                                void *ptr;
+				cl_mem ptr;
 
-                                ret = _starpu_opencl_allocate_memory(&ptr, nnz*elemsize, CL_MEM_READ_WRITE);
+                                ret = starpu_opencl_allocate_memory(&ptr, nnz*elemsize, CL_MEM_READ_WRITE);
                                 addr_nzval = (uintptr_t)ptr;
 				if (ret) goto fail_nzval;
 
-                                ret = _starpu_opencl_allocate_memory(&ptr, nnz*sizeof(uint32_t), CL_MEM_READ_WRITE);
-                                addr_colind = ptr;
+                                ret = starpu_opencl_allocate_memory(&ptr, nnz*sizeof(uint32_t), CL_MEM_READ_WRITE);
+                                addr_colind = (void*) ptr;
 				if (ret) goto fail_colind;
 
-                                ret = _starpu_opencl_allocate_memory(&ptr, (nrow+1)*sizeof(uint32_t), CL_MEM_READ_WRITE);
-                                addr_rowptr = ptr;
+                                ret = starpu_opencl_allocate_memory(&ptr, (nrow+1)*sizeof(uint32_t), CL_MEM_READ_WRITE);
+                                addr_rowptr = (void*) ptr;
 				if (ret) goto fail_rowptr;
 
 				break;
 			}
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 
 	/* allocation succeeded */
-	allocated_memory = 
+	allocated_memory =
 		nnz*elemsize + nnz*sizeof(uint32_t) + (nrow+1)*sizeof(uint32_t);
 
 	/* update the data properly in consequence */
 	csr_interface->nzval = addr_nzval;
 	csr_interface->colind = addr_colind;
 	csr_interface->rowptr = addr_rowptr;
-	
+
 	return allocated_memory;
 
 fail_rowptr:
-	switch(kind) {
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
 			free((void *)addr_colind);
 #ifdef STARPU_USE_CUDA
 		case STARPU_CUDA_RAM:
-			cudaFree((void*)addr_colind);
+		{
+			cudaError_t err;
+			err = cudaFree((void*)addr_colind);
+			if (STARPU_UNLIKELY(err != cudaSuccess))
+				STARPU_CUDA_REPORT_ERROR(err);
 			break;
+		}
 #endif
 #ifdef STARPU_USE_OPENCL
 		case STARPU_OPENCL_RAM:
@@ -328,17 +340,23 @@ fail_rowptr:
 			break;
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 
 fail_colind:
-	switch(kind) {
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
 			free((void *)addr_nzval);
 #ifdef STARPU_USE_CUDA
 		case STARPU_CUDA_RAM:
-			cudaFree((void*)addr_nzval);
+		{
+			cudaError_t err;
+			err = cudaFree((void*)addr_nzval);
+			if (STARPU_UNLIKELY(err != cudaSuccess))
+				STARPU_CUDA_REPORT_ERROR(err);
 			break;
+		}
 #endif
 #ifdef STARPU_USE_OPENCL
 		case STARPU_OPENCL_RAM:
@@ -346,7 +364,7 @@ fail_colind:
 			break;
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 
 fail_nzval:
@@ -357,10 +375,11 @@ fail_nzval:
 
 static void free_csr_buffer_on_node(void *data_interface, uint32_t node)
 {
-	starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *) data_interface;
+	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *) data_interface;
 
-	starpu_node_kind kind = _starpu_get_node_kind(node);
-	switch(kind) {
+	enum starpu_node_kind kind = starpu_node_get_kind(node);
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
 			free((void*)csr_interface->nzval);
 			free((void*)csr_interface->colind);
@@ -368,10 +387,19 @@ static void free_csr_buffer_on_node(void *data_interface, uint32_t node)
 			break;
 #ifdef STARPU_USE_CUDA
 		case STARPU_CUDA_RAM:
-			cudaFree((void*)csr_interface->nzval);
-			cudaFree((void*)csr_interface->colind);
-			cudaFree((void*)csr_interface->rowptr);
+		{
+			cudaError_t err;
+			err = cudaFree((void*)csr_interface->nzval);
+			if (STARPU_UNLIKELY(err != cudaSuccess))
+				STARPU_CUDA_REPORT_ERROR(err);
+			err = cudaFree((void*)csr_interface->colind);
+			if (STARPU_UNLIKELY(err != cudaSuccess))
+				STARPU_CUDA_REPORT_ERROR(err);
+			err = cudaFree((void*)csr_interface->rowptr);
+			if (STARPU_UNLIKELY(err != cudaSuccess))
+				STARPU_CUDA_REPORT_ERROR(err);
 			break;
+		}
 #endif
 #ifdef STARPU_USE_OPENCL
 		case STARPU_OPENCL_RAM:
@@ -381,15 +409,15 @@ static void free_csr_buffer_on_node(void *data_interface, uint32_t node)
 			break;
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 }
 
 #ifdef STARPU_USE_CUDA
 static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
 {
-	starpu_csr_interface_t *src_csr = src_interface;
-	starpu_csr_interface_t *dst_csr = dst_interface;
+	struct starpu_csr_interface *src_csr = src_interface;
+	struct starpu_csr_interface *dst_csr = dst_interface;
 
 	uint32_t nnz = src_csr->nnz;
 	uint32_t nrow = src_csr->nrow;
@@ -409,15 +437,15 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 
 	return 0;
 }
 
 static int copy_cuda_common_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind, cudaStream_t stream)
 {
-	starpu_csr_interface_t *src_csr = src_interface;
-	starpu_csr_interface_t *dst_csr = dst_interface;
+	struct starpu_csr_interface *src_csr = src_interface;
+	struct starpu_csr_interface *dst_csr = dst_interface;
 
 	uint32_t nnz = src_csr->nnz;
 	uint32_t nrow = src_csr->nrow;
@@ -427,10 +455,12 @@ static int copy_cuda_common_async(void *src_interface, unsigned src_node STARPU_
 
 	int synchronous_fallback = 0;
 
+	_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 	cures = cudaMemcpyAsync((char *)dst_csr->nzval, (char *)src_csr->nzval, nnz*elemsize, kind, stream);
 	if (cures)
 	{
 		synchronous_fallback = 1;
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpy((char *)dst_csr->nzval, (char *)src_csr->nzval, nnz*elemsize, kind);
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
@@ -444,6 +474,7 @@ static int copy_cuda_common_async(void *src_interface, unsigned src_node STARPU_
 	if (synchronous_fallback || cures != cudaSuccess)
 	{
 		synchronous_fallback = 1;
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpy((char *)dst_csr->colind, (char *)src_csr->colind, nnz*sizeof(uint32_t), kind);
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
@@ -457,17 +488,20 @@ static int copy_cuda_common_async(void *src_interface, unsigned src_node STARPU_
 	if (synchronous_fallback || cures != cudaSuccess)
 	{
 		synchronous_fallback = 1;
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpy((char *)dst_csr->rowptr, (char *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t), kind);
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
 	}
-	
+
 	if (synchronous_fallback)
 	{
-		STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+		_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 		return 0;
 	}
-	else {
+	else
+	{
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		return -EAGAIN;
 	}
 }
@@ -475,15 +509,15 @@ static int copy_cuda_common_async(void *src_interface, unsigned src_node STARPU_
 static int copy_cuda_peer(void *src_interface STARPU_ATTRIBUTE_UNUSED, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface STARPU_ATTRIBUTE_UNUSED, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
 #ifdef HAVE_CUDA_MEMCPY_PEER
-	starpu_csr_interface_t *src_csr = src_interface;
-	starpu_csr_interface_t *dst_csr = dst_interface;
+	struct starpu_csr_interface *src_csr = src_interface;
+	struct starpu_csr_interface *dst_csr = dst_interface;
 
 	uint32_t nnz = src_csr->nnz;
 	uint32_t nrow = src_csr->nrow;
 	size_t elemsize = src_csr->elemsize;
 
-	int src_dev = starpu_memory_node_to_devid(src_node);
-	int dst_dev = starpu_memory_node_to_devid(dst_node);
+	int src_dev = _starpu_memory_node_to_devid(src_node);
+	int dst_dev = _starpu_memory_node_to_devid(dst_node);
 
 	cudaError_t cures;
 
@@ -499,7 +533,7 @@ static int copy_cuda_peer(void *src_interface STARPU_ATTRIBUTE_UNUSED, unsigned
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 
 	return 0;
 #else
@@ -512,8 +546,8 @@ static int copy_cuda_peer_async(void *src_interface STARPU_ATTRIBUTE_UNUSED, uns
 				void *dst_interface STARPU_ATTRIBUTE_UNUSED, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream STARPU_ATTRIBUTE_UNUSED)
 {
 #ifdef HAVE_CUDA_MEMCPY_PEER
-	starpu_csr_interface_t *src_csr = src_interface;
-	starpu_csr_interface_t *dst_csr = dst_interface;
+	struct starpu_csr_interface *src_csr = src_interface;
+	struct starpu_csr_interface *dst_csr = dst_interface;
 
 	uint32_t nnz = src_csr->nnz;
 	uint32_t nrow = src_csr->nrow;
@@ -521,15 +555,17 @@ static int copy_cuda_peer_async(void *src_interface STARPU_ATTRIBUTE_UNUSED, uns
 
 	cudaError_t cures;
 
-	int src_dev = starpu_memory_node_to_devid(src_node);
-	int dst_dev = starpu_memory_node_to_devid(dst_node);
+	int src_dev = _starpu_memory_node_to_devid(src_node);
+	int dst_dev = _starpu_memory_node_to_devid(dst_node);
 
 	int synchronous_fallback = 0;
 
+	_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 	cures = cudaMemcpyPeerAsync((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize, stream);
 	if (cures)
 	{
 		synchronous_fallback = 1;
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpyPeer((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize);
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
@@ -543,6 +579,7 @@ static int copy_cuda_peer_async(void *src_interface STARPU_ATTRIBUTE_UNUSED, uns
 	if (synchronous_fallback || cures != cudaSuccess)
 	{
 		synchronous_fallback = 1;
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpyPeer((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t));
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
@@ -556,17 +593,20 @@ static int copy_cuda_peer_async(void *src_interface STARPU_ATTRIBUTE_UNUSED, uns
 	if (synchronous_fallback || cures != cudaSuccess)
 	{
 		synchronous_fallback = 1;
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpyPeer((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t));
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
 	}
-	
+
 	if (synchronous_fallback)
 	{
-		STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+		_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 		return 0;
 	}
-	else {
+	else
+	{
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		return -EAGAIN;
 	}
 #else
@@ -617,8 +657,8 @@ static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void
 #ifdef STARPU_USE_OPENCL
 static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
-	starpu_csr_interface_t *src_csr = src_interface;
-	starpu_csr_interface_t *dst_csr = dst_interface;
+	struct starpu_csr_interface *src_csr = src_interface;
+	struct starpu_csr_interface *dst_csr = dst_interface;
 
 	uint32_t nnz = src_csr->nnz;
 	uint32_t nrow = src_csr->nrow;
@@ -626,27 +666,27 @@ static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTR
 
         int err;
 
-        err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->nzval, (void *)dst_csr->nzval, nnz*elemsize, 0, NULL);
+        err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->nzval, src_node, (void *)dst_csr->nzval, dst_node, nnz*elemsize, 0, NULL);
 	if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->colind, (void *)dst_csr->colind, nnz*sizeof(uint32_t), 0, NULL);
+	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->colind, src_node, (void *)dst_csr->colind, dst_node, nnz*sizeof(uint32_t), 0, NULL);
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-        err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->rowptr, (void *)dst_csr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
+        err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->rowptr, src_node, (void *)dst_csr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), 0, NULL);
 	if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 
 	return 0;
 }
 
 static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
-	starpu_csr_interface_t *src_csr = src_interface;
-	starpu_csr_interface_t *dst_csr = dst_interface;
+	struct starpu_csr_interface *src_csr = src_interface;
+	struct starpu_csr_interface *dst_csr = dst_interface;
 
 	uint32_t nnz = src_csr->nnz;
 	uint32_t nrow = src_csr->nrow;
@@ -654,19 +694,19 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTR
 
         int err;
 
-        err = _starpu_opencl_copy_ram_to_opencl((void *)src_csr->nzval, (cl_mem)dst_csr->nzval, nnz*elemsize, 0, NULL);
+        err = starpu_opencl_copy_ram_to_opencl((void *)src_csr->nzval, src_node, (cl_mem)dst_csr->nzval, dst_node, nnz*elemsize, 0, NULL);
 	if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = _starpu_opencl_copy_ram_to_opencl((void *)src_csr->colind, (cl_mem)dst_csr->colind, nnz*sizeof(uint32_t), 0, NULL);
+	err = starpu_opencl_copy_ram_to_opencl((void *)src_csr->colind, src_node, (cl_mem)dst_csr->colind, dst_node, nnz*sizeof(uint32_t), 0, NULL);
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-        err = _starpu_opencl_copy_ram_to_opencl((void *)src_csr->rowptr, (cl_mem)dst_csr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
+        err = starpu_opencl_copy_ram_to_opencl((void *)src_csr->rowptr, src_node, (cl_mem)dst_csr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), 0, NULL);
 	if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 
 	return 0;
 }
@@ -675,8 +715,8 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTR
 /* as not all platform easily have a BLAS lib installed ... */
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
-	starpu_csr_interface_t *src_csr = (starpu_csr_interface_t *) src_interface;
-	starpu_csr_interface_t *dst_csr = (starpu_csr_interface_t *) dst_interface;
+	struct starpu_csr_interface *src_csr = (struct starpu_csr_interface *) src_interface;
+	struct starpu_csr_interface *dst_csr = (struct starpu_csr_interface *) dst_interface;
 
 	uint32_t nnz = src_csr->nnz;
 	uint32_t nrow = src_csr->nrow;
@@ -688,7 +728,7 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBU
 
 	memcpy((void *)dst_csr->rowptr, (void *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t));
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 
 	return 0;
 }
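
Registration sketch matching the starpu_csr_data_register() prototype shown in this file's diff (the 3x3 identity matrix used as input is illustrative only):

#include <starpu.h>
#include <stdint.h>

static starpu_data_handle_t register_identity_csr(void)
{
	/* 3x3 identity in CSR form: one non-zero per row. */
	static float nzval[3] = { 1.0f, 1.0f, 1.0f };
	static uint32_t colind[3] = { 0, 1, 2 };
	static uint32_t rowptr[4] = { 0, 1, 2, 3 };

	starpu_data_handle_t handle;
	starpu_csr_data_register(&handle, 0 /* home node: main RAM */,
				 3 /* nnz */, 3 /* nrow */,
				 (uintptr_t) nzval, colind, rowptr,
				 0 /* firstentry */, sizeof(nzval[0]));
	return handle;
}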

+ 179 - 75
src/datawizard/interfaces/data_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009, 2010, 2011-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,18 +21,20 @@
 #include <core/dependencies/data_concurrency.h>
 #include <common/uthash.h>
 #include <common/starpu_spinlock.h>
+#include <core/task.h>
 
 /* Entry in the `registered_handles' hash table.  */
 struct handle_entry
 {
 	UT_hash_handle hh;
 	void *pointer;
-	starpu_data_handle handle;
+	starpu_data_handle_t handle;
 };
 
 /* Hash table mapping host pointers to data handles.  */
 static struct handle_entry *registered_handles;
-static starpu_spinlock_t    registered_handles_lock;
+static struct _starpu_spinlock    registered_handles_lock;
+static int _data_interface_number = STARPU_MAX_INTERFACE_ID;
 
 void _starpu_data_interface_init()
 {
@@ -45,7 +47,8 @@ void _starpu_data_interface_shutdown()
 
 	_starpu_spin_destroy(&registered_handles_lock);
 
-	HASH_ITER(hh, registered_handles, entry, tmp) {
+	HASH_ITER(hh, registered_handles, entry, tmp)
+	{
 		HASH_DEL(registered_handles, entry);
 		free(entry);
 	}
@@ -55,7 +58,7 @@ void _starpu_data_interface_shutdown()
 
 /* Register the mapping from PTR to HANDLE.  If PTR is already mapped to
  * some handle, the new mapping shadows the previous one.   */
-void _starpu_data_register_ram_pointer(starpu_data_handle handle, void *ptr)
+void _starpu_data_register_ram_pointer(starpu_data_handle_t handle, void *ptr)
 {
 	struct handle_entry *entry;
 
@@ -70,9 +73,9 @@ void _starpu_data_register_ram_pointer(starpu_data_handle handle, void *ptr)
 	_starpu_spin_unlock(&registered_handles_lock);
 }
 
-starpu_data_handle starpu_data_lookup(const void *ptr)
+starpu_data_handle_t starpu_data_lookup(const void *ptr)
 {
-	starpu_data_handle result;
+	starpu_data_handle_t result;
 
 	_starpu_spin_lock(&registered_handles_lock);
 	{
@@ -89,11 +92,16 @@ starpu_data_handle starpu_data_lookup(const void *ptr)
 	return result;
 }
 
-/* 
+int
+_starpu_data_is_multiformat_handle(starpu_data_handle_t handle)
+{
+	return handle->ops->is_multiformat;
+}
+/*
  * Start monitoring a piece of data
  */
 
-static void _starpu_register_new_data(starpu_data_handle handle,
+static void _starpu_register_new_data(starpu_data_handle_t handle,
 					uint32_t home_node, uint32_t wt_mask)
 {
 	void *ptr;
@@ -101,8 +109,12 @@ static void _starpu_register_new_data(starpu_data_handle handle,
 	STARPU_ASSERT(handle);
 
 	/* initialize the new lock */
-	handle->req_list = starpu_data_requester_list_new();
+	handle->req_list = _starpu_data_requester_list_new();
 	handle->refcnt = 0;
+	handle->busy_count = 0;
+	handle->busy_waiting = 0;
+	_STARPU_PTHREAD_MUTEX_INIT(&handle->busy_mutex, NULL);
+	_STARPU_PTHREAD_COND_INIT(&handle->busy_cond, NULL);
 	_starpu_spin_init(&handle->header_lock);
 
 	/* first take care to properly lock the data */
@@ -122,7 +134,7 @@ static void _starpu_register_new_data(starpu_data_handle handle,
 	handle->sequential_consistency =
 		starpu_data_get_default_sequential_consistency_flag();
 
-	PTHREAD_MUTEX_INIT(&handle->sequential_consistency_mutex, NULL);
+	_STARPU_PTHREAD_MUTEX_INIT(&handle->sequential_consistency_mutex, NULL);
 	handle->last_submitted_mode = STARPU_R;
 	handle->last_submitted_writer = NULL;
 	handle->last_submitted_readers = NULL;
@@ -134,7 +146,7 @@ static void _starpu_register_new_data(starpu_data_handle handle,
 	handle->init_cl = NULL;
 
 	handle->reduction_refcnt = 0;
-	handle->reduction_req_list = starpu_data_requester_list_new();
+	handle->reduction_req_list = _starpu_data_requester_list_new();
 
 #ifdef STARPU_USE_FXT
 	handle->last_submitted_ghost_writer_id_is_valid = 0;
@@ -146,6 +158,7 @@ static void _starpu_register_new_data(starpu_data_handle handle,
 
 	/* Store some values directly in the handle not to recompute them all
 	 * the time. */
+	STARPU_ASSERT(handle->ops->get_size);
 	handle->data_size = handle->ops->get_size(handle);
 	handle->footprint = _starpu_compute_data_footprint(handle);
 
@@ -156,20 +169,22 @@ static void _starpu_register_new_data(starpu_data_handle handle,
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		struct starpu_data_replicate_s *replicate;
+		struct _starpu_data_replicate *replicate;
 		replicate = &handle->per_node[node];
-		
+
 		replicate->memory_node = node;
 		replicate->relaxed_coherency = 0;
 		replicate->refcnt = 0;
 
-		if (node == home_node) {
+		if (node == home_node)
+		{
 			/* this is the home node with the only valid copy */
 			replicate->state = STARPU_OWNER;
 			replicate->allocated = 1;
 			replicate->automatically_allocated = 0;
 		}
-		else {
+		else
+		{
 			/* the value is not available here yet */
 			replicate->state = STARPU_INVALID;
 			replicate->allocated = 0;
@@ -180,7 +195,7 @@ static void _starpu_register_new_data(starpu_data_handle handle,
 	unsigned nworkers = starpu_worker_get_count();
 	for (worker = 0; worker < nworkers; worker++)
 	{
-		struct starpu_data_replicate_s *replicate;
+		struct _starpu_data_replicate *replicate;
 		replicate = &handle->per_worker[worker];
 		replicate->allocated = 0;
 		replicate->automatically_allocated = 0;
@@ -194,6 +209,7 @@ static void _starpu_register_new_data(starpu_data_handle handle,
 			replicate->request[node] = NULL;
 		}
 
+		/* Assuming being used for SCRATCH for now, patched when entering REDUX mode */
 		replicate->relaxed_coherency = 1;
 		replicate->initialized = 0;
 		replicate->memory_node = starpu_worker_get_memory_node(worker);
@@ -212,10 +228,9 @@ static void _starpu_register_new_data(starpu_data_handle handle,
 	}
 }
 
-static starpu_data_handle _starpu_data_handle_allocate(struct starpu_data_interface_ops_t *interface_ops)
+static starpu_data_handle_t _starpu_data_handle_allocate(struct starpu_data_interface_ops *interface_ops)
 {
-	starpu_data_handle handle = (starpu_data_handle)
-		calloc(1, sizeof(struct starpu_data_state_t));
+	starpu_data_handle_t handle = (starpu_data_handle_t) calloc(1, sizeof(struct _starpu_data_state));
 
 	STARPU_ASSERT(handle);
 
@@ -226,7 +241,16 @@ static starpu_data_handle _starpu_data_handle_allocate(struct starpu_data_interf
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		struct starpu_data_replicate_s *replicate;
+#ifdef STARPU_MEMORY_STATUS
+		/* Stats initilization */
+		handle->stats_direct_access[node]=0;
+		handle->stats_loaded_shared[node]=0;
+		handle->stats_shared_to_owner[node]=0;
+		handle->stats_loaded_owner[node]=0;
+		handle->stats_invalidated[node]=0;
+#endif
+
+		struct _starpu_data_replicate *replicate;
 		replicate = &handle->per_node[node];
 		/* relaxed_coherency = 0 */
 
@@ -240,7 +264,7 @@ static starpu_data_handle _starpu_data_handle_allocate(struct starpu_data_interf
 	unsigned nworkers = starpu_worker_get_count();
 	for (worker = 0; worker < nworkers; worker++)
 	{
-		struct starpu_data_replicate_s *replicate;
+		struct _starpu_data_replicate *replicate;
 		replicate = &handle->per_worker[worker];
 
 		replicate->handle = handle;
@@ -253,24 +277,31 @@ static starpu_data_handle _starpu_data_handle_allocate(struct starpu_data_interf
 	return handle;
 }
 
-void starpu_data_register(starpu_data_handle *handleptr, uint32_t home_node,
+void starpu_data_register(starpu_data_handle_t *handleptr, uint32_t home_node,
 				void *data_interface,
-				struct starpu_data_interface_ops_t *ops)
+				struct starpu_data_interface_ops *ops)
 {
-	starpu_data_handle handle =
+	starpu_data_handle_t handle =
 		_starpu_data_handle_allocate(ops);
 
 	STARPU_ASSERT(handleptr);
 	*handleptr = handle;
-
+	handle->mf_node = home_node;
 
 	/* fill the interface fields with the appropriate method */
+	STARPU_ASSERT(ops->register_data_handle);
 	ops->register_data_handle(handle, home_node, data_interface);
 
 	_starpu_register_new_data(handle, home_node, 0);
 }
 
-void *starpu_handle_to_pointer(starpu_data_handle handle, uint32_t node)
+void starpu_data_register_same(starpu_data_handle_t *handledst, starpu_data_handle_t handlesrc)
+{
+	void *local_interface = starpu_data_get_interface_on_node(handlesrc, 0);
+	starpu_data_register(handledst, -1, local_interface, handlesrc->ops);
+}
+
+void *starpu_handle_to_pointer(starpu_data_handle_t handle, uint32_t node)
 {
 	/* Check whether the operation is supported and the node has actually
 	 * been allocated.  */
@@ -283,39 +314,39 @@ void *starpu_handle_to_pointer(starpu_data_handle handle, uint32_t node)
 	return NULL;
 }
 
-void *starpu_handle_get_local_ptr(starpu_data_handle handle)
+void *starpu_handle_get_local_ptr(starpu_data_handle_t handle)
 {
 	return starpu_handle_to_pointer(handle,
 					_starpu_get_local_memory_node());
 }
 
-int starpu_data_get_rank(starpu_data_handle handle)
+int starpu_data_get_rank(starpu_data_handle_t handle)
 {
 	return handle->rank;
 }
 
-int starpu_data_set_rank(starpu_data_handle handle, int rank)
+int starpu_data_set_rank(starpu_data_handle_t handle, int rank)
 {
         handle->rank = rank;
         return 0;
 }
 
-int starpu_data_get_tag(starpu_data_handle handle)
+int starpu_data_get_tag(starpu_data_handle_t handle)
 {
 	return handle->tag;
 }
 
-int starpu_data_set_tag(starpu_data_handle handle, int tag)
+int starpu_data_set_tag(starpu_data_handle_t handle, int tag)
 {
         handle->tag = tag;
         return 0;
 }
 
-/* 
+/*
  * Stop monitoring a piece of data
  */
 
-void _starpu_data_free_interfaces(starpu_data_handle handle)
+void _starpu_data_free_interfaces(starpu_data_handle_t handle)
 {
 	const void *ram_ptr;
 	unsigned node;
@@ -348,40 +379,55 @@ void _starpu_data_free_interfaces(starpu_data_handle handle)
 	}
 }
 
-struct unregister_callback_arg {
+struct _starpu_unregister_callback_arg
+{
 	unsigned memory_node;
-	starpu_data_handle handle;
+	starpu_data_handle_t handle;
 	unsigned terminated;
 	pthread_mutex_t mutex;
 	pthread_cond_t cond;
-}; 
+};
+
+/* Check whether we should tell starpu_data_unregister that the data handle is
+ * not busy any more.
+ * The header is supposed to be locked */
+void _starpu_data_check_not_busy(starpu_data_handle_t handle)
+{
+	if (!handle->busy_count && handle->busy_waiting)
+	{
+		_STARPU_PTHREAD_MUTEX_LOCK(&handle->busy_mutex);
+		_STARPU_PTHREAD_COND_BROADCAST(&handle->busy_cond);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->busy_mutex);
+	}
+}
 
 static void _starpu_data_unregister_fetch_data_callback(void *_arg)
 {
 	int ret;
-	struct unregister_callback_arg *arg = (struct unregister_callback_arg *) _arg;
+	struct _starpu_unregister_callback_arg *arg = (struct _starpu_unregister_callback_arg *) _arg;
 
-	starpu_data_handle handle = arg->handle;
+	starpu_data_handle_t handle = arg->handle;
 
 	STARPU_ASSERT(handle);
 
-	struct starpu_data_replicate_s *replicate = &handle->per_node[arg->memory_node];
+	struct _starpu_data_replicate *replicate = &handle->per_node[arg->memory_node];
 
-	ret = _starpu_fetch_data_on_node(handle, replicate, STARPU_R, 0, NULL, NULL);
+	ret = _starpu_fetch_data_on_node(handle, replicate, STARPU_R, 0, 0, NULL, NULL);
 	STARPU_ASSERT(!ret);
-	
+
 	/* unlock the caller */
-	PTHREAD_MUTEX_LOCK(&arg->mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&arg->mutex);
 	arg->terminated = 1;
-	PTHREAD_COND_SIGNAL(&arg->cond);
-	PTHREAD_MUTEX_UNLOCK(&arg->mutex);
+	_STARPU_PTHREAD_COND_SIGNAL(&arg->cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&arg->mutex);
 }
 
 /* Unregister the data handle, perhaps we don't need to update the home_node
  * (in that case coherent is set to 0) */
-static void _starpu_data_unregister(starpu_data_handle handle, unsigned coherent)
+static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned coherent)
 {
 	STARPU_ASSERT(handle);
+	STARPU_ASSERT_MSG(handle->nchildren == 0, "data needs to be unpartitioned before unregistration");
 
 	if (coherent)
 	{
@@ -390,69 +436,120 @@ static void _starpu_data_unregister(starpu_data_handle handle, unsigned coherent
 
 		/* Fetch data in the home of the data to ensure we have a valid copy
 		 * where we registered it */
-		int home_node = handle->home_node; 
+		int home_node = handle->home_node;
 		if (home_node >= 0)
 		{
-			struct unregister_callback_arg arg;
+			struct _starpu_unregister_callback_arg arg;
 			arg.handle = handle;
 			arg.memory_node = (unsigned)home_node;
 			arg.terminated = 0;
-			PTHREAD_MUTEX_INIT(&arg.mutex, NULL);
-			PTHREAD_COND_INIT(&arg.cond, NULL);
-	
+			_STARPU_PTHREAD_MUTEX_INIT(&arg.mutex, NULL);
+			_STARPU_PTHREAD_COND_INIT(&arg.cond, NULL);
+
 			if (!_starpu_attempt_to_submit_data_request_from_apps(handle, STARPU_R,
 					_starpu_data_unregister_fetch_data_callback, &arg))
 			{
 				/* no one has locked this data yet, so we proceed immediately */
-				struct starpu_data_replicate_s *home_replicate = &handle->per_node[home_node];
-				int ret = _starpu_fetch_data_on_node(handle, home_replicate, STARPU_R, 0, NULL, NULL);
+				struct _starpu_data_replicate *home_replicate = &handle->per_node[home_node];
+				int ret = _starpu_fetch_data_on_node(handle, home_replicate, STARPU_R, 0, 0, NULL, NULL);
 				STARPU_ASSERT(!ret);
 			}
-			else {
-				PTHREAD_MUTEX_LOCK(&arg.mutex);
+			else
+			{
+				_STARPU_PTHREAD_MUTEX_LOCK(&arg.mutex);
 				while (!arg.terminated)
-					PTHREAD_COND_WAIT(&arg.cond, &arg.mutex);
-				PTHREAD_MUTEX_UNLOCK(&arg.mutex);
+					_STARPU_PTHREAD_COND_WAIT(&arg.cond, &arg.mutex);
+				_STARPU_PTHREAD_MUTEX_UNLOCK(&arg.mutex);
 			}
+			_starpu_release_data_on_node(handle, 0, &handle->per_node[home_node]);
+		}
+
+		/* If this handle uses a multiformat interface, we may have to convert
+		 * this piece of data back into the CPU format.
+		 * XXX : This is quite hacky, could we submit a task instead ?
+		 */
+		if (_starpu_data_is_multiformat_handle(handle) &&
+			starpu_node_get_kind(handle->mf_node) != STARPU_CPU_RAM)
+		{
+			_STARPU_DEBUG("Conversion needed\n");
+			void *buffers[1];
+			struct starpu_multiformat_interface *format_interface;
+			format_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, 0);
+			struct starpu_codelet *cl;
+			enum starpu_node_kind node_kind = starpu_node_get_kind(handle->mf_node);
+
+			struct starpu_multiformat_data_interface_ops *mf_ops;
+			mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
+			switch (node_kind)
+			{
+#ifdef STARPU_USE_CUDA
+				case STARPU_CUDA_RAM:
+					cl = mf_ops->cuda_to_cpu_cl;
+					break;
+#endif
+#ifdef STARPU_USE_OPENCL
+				case STARPU_OPENCL_RAM:
+					cl = mf_ops->opencl_to_cpu_cl;
+					break;
+#endif
+				case STARPU_CPU_RAM:      /* Impossible ! */
+				case STARPU_SPU_LS:       /* Not supported */
+				default:
+					STARPU_ASSERT(0);
+			}
+			buffers[0] = format_interface;
+
+			_starpu_cl_func_t func = _starpu_task_get_cpu_nth_implementation(cl, 0);
+			STARPU_ASSERT(func);
+			func(buffers, NULL);
 		}
 	}
-	else {
+	else
+	{
 		/* Should we postpone the unregister operation ? */
 		if ((handle->refcnt > 0) && handle->lazy_unregister)
 			return;
 	}
 
+	_starpu_spin_lock(&handle->header_lock);
+	/* Tell holders of references that we're starting waiting */
+	handle->busy_waiting = 1;
+	_starpu_spin_unlock(&handle->header_lock);
+
+	/* Wait for all requests to finish (notably WT requests) */
+	_STARPU_PTHREAD_MUTEX_LOCK(&handle->busy_mutex);
+	while (handle->busy_count)
+		_STARPU_PTHREAD_COND_WAIT(&handle->busy_cond, &handle->busy_mutex);
+
+	/* Wait for finished requests to release the handle */
+	_starpu_spin_lock(&handle->header_lock);
 	_starpu_data_free_interfaces(handle);
 
 	/* Destroy the data now */
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		struct starpu_data_replicate_s *local = &handle->per_node[node];
-
-		if (local->allocated && local->automatically_allocated){
-			/* free the data copy in a lazy fashion */
-			_starpu_request_mem_chunk_removal(handle, node);
-		}
+		/* free the data copy in a lazy fashion */
+		_starpu_request_mem_chunk_removal(handle, node);
 	}
 
-	starpu_data_requester_list_delete(handle->req_list);
-	starpu_data_requester_list_delete(handle->reduction_req_list);
+	_starpu_data_requester_list_delete(handle->req_list);
+	_starpu_data_requester_list_delete(handle->reduction_req_list);
 
 	free(handle);
 }
 
-void starpu_data_unregister(starpu_data_handle handle)
+void starpu_data_unregister(starpu_data_handle_t handle)
 {
 	_starpu_data_unregister(handle, 1);
 }
 
-void starpu_data_unregister_no_coherency(starpu_data_handle handle)
+void starpu_data_unregister_no_coherency(starpu_data_handle_t handle)
 {
 	_starpu_data_unregister(handle, 0);
 }
 
-void starpu_data_invalidate(starpu_data_handle handle)
+void starpu_data_invalidate(starpu_data_handle_t handle)
 {
 	STARPU_ASSERT(handle);
 
@@ -463,14 +560,15 @@ void starpu_data_invalidate(starpu_data_handle handle)
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		struct starpu_data_replicate_s *local = &handle->per_node[node];
+		struct _starpu_data_replicate *local = &handle->per_node[node];
 
-		if (local->allocated && local->automatically_allocated){
+		if (local->allocated && local->automatically_allocated)
+		{
 			/* free the data copy in a lazy fashion */
 			_starpu_request_mem_chunk_removal(handle, node);
 		}
 
-		local->state = STARPU_INVALID; 
+		local->state = STARPU_INVALID;
 	}
 
 	_starpu_spin_unlock(&handle->header_lock);
@@ -478,12 +576,18 @@ void starpu_data_invalidate(starpu_data_handle handle)
 	starpu_data_release(handle);
 }
 
-unsigned starpu_get_handle_interface_id(starpu_data_handle handle)
+enum starpu_data_interface_id starpu_handle_get_interface_id(starpu_data_handle_t handle)
 {
 	return handle->ops->interfaceid;
 }
 
-void *starpu_data_get_interface_on_node(starpu_data_handle handle, unsigned memory_node)
+void *starpu_data_get_interface_on_node(starpu_data_handle_t handle, unsigned memory_node)
 {
 	return handle->per_node[memory_node].data_interface;
 }
+
+int starpu_data_interface_get_next_id()
+{
+	_data_interface_number += 1;
+	return _data_interface_number-1;
+}
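
A short sketch of the public entry points touched here, including the new starpu_data_register_same() (the starpu_init()/starpu_matrix_data_register() calls are the standard public API, assumed unchanged by this commit):

#include <starpu.h>
#include <stdlib.h>

#define NX 64
#define NY 32

int main(void)
{
	float *A = malloc(NX*NY*sizeof(*A));
	starpu_data_handle_t a_handle, tmp_handle;

	starpu_init(NULL);
	starpu_matrix_data_register(&a_handle, 0 /* home node */,
				    (uintptr_t) A, NX /* ld */, NX, NY, sizeof(*A));

	/* The handle can be looked up from the registered host pointer. */
	STARPU_ASSERT(starpu_data_lookup(A) == a_handle);

	/* New in this commit: a handle with the same layout as an existing
	 * one, but no home node (-1), e.g. for temporary buffers. */
	starpu_data_register_same(&tmp_handle, a_handle);

	/* ... submit tasks ... */

	starpu_data_unregister(tmp_handle);
	starpu_data_unregister(a_handle); /* now waits for pending requests (busy_count) */
	starpu_shutdown();
	free(A);
	return 0;
}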

+ 6 - 3
src/datawizard/interfaces/data_interface.h

@@ -22,15 +22,18 @@
 #include <common/config.h>
 
 /* Some data interfaces or filters use this interface internally */
-extern struct starpu_data_interface_ops_t _starpu_interface_matrix_ops;
-void _starpu_data_free_interfaces(starpu_data_handle handle)
+extern struct starpu_data_interface_ops _starpu_interface_matrix_ops;
+void _starpu_data_free_interfaces(starpu_data_handle_t handle)
 	STARPU_ATTRIBUTE_INTERNAL;
 
 extern void _starpu_data_interface_init(void) STARPU_ATTRIBUTE_INTERNAL;
+extern void _starpu_data_check_not_busy(starpu_data_handle_t handle) STARPU_ATTRIBUTE_INTERNAL;
 extern void _starpu_data_interface_shutdown(void) STARPU_ATTRIBUTE_INTERNAL;
 
-extern void _starpu_data_register_ram_pointer(starpu_data_handle handle,
+extern void _starpu_data_register_ram_pointer(starpu_data_handle_t handle,
 						void *ptr)
 	STARPU_ATTRIBUTE_INTERNAL;
 
+extern int _starpu_data_is_multiformat_handle(starpu_data_handle_t handle);
+
 #endif // __DATA_INTERFACE_H__

+ 26 - 23
src/datawizard/interfaces/matrix_filters.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
@@ -25,9 +25,9 @@
  */
 void starpu_block_filter_func(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, unsigned nchunks)
 {
-       starpu_matrix_interface_t *matrix_father = (starpu_matrix_interface_t *) father_interface;
-       starpu_matrix_interface_t *matrix_child = (starpu_matrix_interface_t *) child_interface;
-  
+	struct starpu_matrix_interface *matrix_father = (struct starpu_matrix_interface *) father_interface;
+	struct starpu_matrix_interface *matrix_child = (struct starpu_matrix_interface *) child_interface;
+
 	uint32_t nx = matrix_father->nx;
 	uint32_t ny = matrix_father->ny;
 	size_t elemsize = matrix_father->elemsize;
@@ -36,28 +36,30 @@ void starpu_block_filter_func(void *father_interface, void *child_interface, STA
 
 	size_t chunk_size = ((size_t)nx + nchunks - 1)/nchunks;
 	size_t offset = (size_t)id*chunk_size*elemsize;
-	
-	uint32_t child_nx = 
+
+	uint32_t child_nx =
 	  STARPU_MIN(chunk_size, (size_t)nx - (size_t)id*chunk_size);
-	
+
 	/* update the child's interface */
 	matrix_child->nx = child_nx;
 	matrix_child->ny = ny;
 	matrix_child->elemsize = elemsize;
-	
+
 	/* is the information on this node valid ? */
-	if (matrix_father->ptr) {
-	  matrix_child->ptr = matrix_father->ptr + offset;
-	  matrix_child->ld = matrix_father->ld;
-	  matrix_child->dev_handle = matrix_father->dev_handle;
-	  matrix_child->offset = matrix_father->offset + offset;
+	if (matrix_father->dev_handle)
+	{
+		if (matrix_father->ptr)
+			matrix_child->ptr = matrix_father->ptr + offset;
+		matrix_child->ld = matrix_father->ld;
+		matrix_child->dev_handle = matrix_father->dev_handle;
+		matrix_child->offset = matrix_father->offset + offset;
 	}
 }
 
 void starpu_vertical_block_filter_func(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, unsigned nchunks)
 {
-        starpu_matrix_interface_t *matrix_father = (starpu_matrix_interface_t *) father_interface;
-        starpu_matrix_interface_t *matrix_child = (starpu_matrix_interface_t *) child_interface;
+        struct starpu_matrix_interface *matrix_father = (struct starpu_matrix_interface *) father_interface;
+        struct starpu_matrix_interface *matrix_child = (struct starpu_matrix_interface *) child_interface;
 
 	uint32_t nx = matrix_father->nx;
 	uint32_t ny = matrix_father->ny;
@@ -66,7 +68,7 @@ void starpu_vertical_block_filter_func(void *father_interface, void *child_inter
 	STARPU_ASSERT(nchunks <= ny);
 
 	size_t chunk_size = ((size_t)ny + nchunks - 1)/nchunks;
-	size_t child_ny = 
+	size_t child_ny =
 	  STARPU_MIN(chunk_size, (size_t)ny - (size_t)id*chunk_size);
 
 	matrix_child->nx = nx;
@@ -74,12 +76,13 @@ void starpu_vertical_block_filter_func(void *father_interface, void *child_inter
 	matrix_child->elemsize = elemsize;
 
 	/* is the information on this node valid ? */
-	if (matrix_father->ptr) {
-	  size_t offset = 
-	    (size_t)id*chunk_size*matrix_father->ld*elemsize;
-	  matrix_child->ptr = matrix_father->ptr + offset;
-	  matrix_child->ld = matrix_father->ld;
-	  matrix_child->dev_handle = matrix_father->dev_handle;
-	  matrix_child->offset = matrix_father->offset + offset;
+	if (matrix_father->dev_handle)
+	{
+		size_t offset = (size_t)id*chunk_size*matrix_father->ld*elemsize;
+		if (matrix_father->ptr)
+			matrix_child->ptr = matrix_father->ptr + offset;
+		matrix_child->ld = matrix_father->ld;
+		matrix_child->dev_handle = matrix_father->dev_handle;
+		matrix_child->offset = matrix_father->offset + offset;
 	}
 }
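
Partitioning sketch for the two matrix filters above (assumes the standard starpu_data_partition()/starpu_data_get_sub_data() API; per the code above, starpu_block_filter_func cuts along nx and starpu_vertical_block_filter_func cuts along ny):

#include <starpu.h>
#include <stdlib.h>

#define NX 1024
#define NY 1024
#define PARTS 4

int main(void)
{
	float *A = malloc((size_t)NX*NY*sizeof(*A));
	starpu_data_handle_t handle;
	unsigned i;

	starpu_init(NULL);
	starpu_matrix_data_register(&handle, 0, (uintptr_t) A,
				    NX /* ld */, NX, NY, sizeof(*A));

	struct starpu_data_filter f =
	{
		.filter_func = starpu_block_filter_func,	/* split along nx */
		.nchildren = PARTS,
	};
	starpu_data_partition(handle, &f);

	for (i = 0; i < PARTS; i++)
	{
		starpu_data_handle_t sub = starpu_data_get_sub_data(handle, 1, i);
		/* ... submit a task on `sub' ... */
		(void) sub;
	}

	starpu_data_unpartition(handle, 0 /* gather on node 0 */);
	starpu_data_unregister(handle);
	starpu_shutdown();
	free(A);
	return 0;
}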

+ 166 - 105
src/datawizard/interfaces/matrix_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,11 +20,14 @@
 #include <datawizard/coherency.h>
 #include <datawizard/copy_driver.h>
 #include <datawizard/filters.h>
-#include <common/hash.h>
+#include <starpu_hash.h>
 #include <starpu_cuda.h>
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 
+/* If you can promise that there is no stride in your matrices, you can define this */
+// #define NO_STRIDE
+
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 #ifdef STARPU_USE_CUDA
 static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
@@ -32,7 +35,9 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIB
 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
-//static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
+#ifdef NO_STRIDE
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
+#endif
 #endif
 #ifdef STARPU_USE_OPENCL
 static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
@@ -41,7 +46,8 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event);
 #endif
 
-static const struct starpu_data_copy_methods matrix_copy_data_methods_s = {
+static const struct starpu_data_copy_methods matrix_copy_data_methods_s =
+{
 	.ram_to_ram = copy_ram_to_ram,
 	.ram_to_spu = NULL,
 #ifdef STARPU_USE_CUDA
@@ -50,7 +56,9 @@ static const struct starpu_data_copy_methods matrix_copy_data_methods_s = {
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
 	.cuda_to_cuda = copy_cuda_to_cuda,
-//	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
+#ifdef NO_STRIDE
+	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
+#endif
 #endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = copy_ram_to_opencl,
@@ -64,19 +72,20 @@ static const struct starpu_data_copy_methods matrix_copy_data_methods_s = {
 	.spu_to_spu = NULL
 };
 
-static void register_matrix_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
-static void *matrix_handle_to_pointer(starpu_data_handle data_handle, uint32_t node);
+static void register_matrix_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface);
+static void *matrix_handle_to_pointer(starpu_data_handle_t data_handle, uint32_t node);
 static ssize_t allocate_matrix_buffer_on_node(void *data_interface_, uint32_t dst_node);
 static void free_matrix_buffer_on_node(void *data_interface, uint32_t node);
-static size_t matrix_interface_get_size(starpu_data_handle handle);
-static uint32_t footprint_matrix_interface_crc32(starpu_data_handle handle);
+static size_t matrix_interface_get_size(starpu_data_handle_t handle);
+static uint32_t footprint_matrix_interface_crc32(starpu_data_handle_t handle);
 static int matrix_compare(void *data_interface_a, void *data_interface_b);
-static void display_matrix_interface(starpu_data_handle handle, FILE *f);
+static void display_matrix_interface(starpu_data_handle_t handle, FILE *f);
 #ifdef STARPU_USE_GORDON
-static int convert_matrix_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss); 
+static int convert_matrix_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss);
 #endif
 
-struct starpu_data_interface_ops_t _starpu_interface_matrix_ops = {
+struct starpu_data_interface_ops _starpu_interface_matrix_ops =
+{
 	.register_data_handle = register_matrix_handle,
 	.allocate_data_on_node = allocate_matrix_buffer_on_node,
 	.handle_to_pointer = matrix_handle_to_pointer,
@@ -88,13 +97,13 @@ struct starpu_data_interface_ops_t _starpu_interface_matrix_ops = {
 #ifdef STARPU_USE_GORDON
 	.convert_to_gordon = convert_matrix_to_gordon,
 #endif
-	.interfaceid = STARPU_MATRIX_INTERFACE_ID, 
-	.interface_size = sizeof(starpu_matrix_interface_t),
-	.display = display_matrix_interface
+	.interfaceid = STARPU_MATRIX_INTERFACE_ID,
+	.interface_size = sizeof(struct starpu_matrix_interface),
+	.display = display_matrix_interface,
 };
 
 #ifdef STARPU_USE_GORDON
-static int convert_matrix_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss) 
+static int convert_matrix_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss)
 {
 	size_t elemsize = GET_MATRIX_ELEMSIZE(interface);
 	uint32_t nx = STARPU_MATRIX_GET_NX(interface);
@@ -111,23 +120,25 @@ static int convert_matrix_to_gordon(void *data_interface, uint64_t *ptr, gordon_
 }
 #endif
 
-static void register_matrix_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface)
+static void register_matrix_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface)
 {
-	starpu_matrix_interface_t *matrix_interface = (starpu_matrix_interface_t *) data_interface;
+	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *) data_interface;
 
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		starpu_matrix_interface_t *local_interface = (starpu_matrix_interface_t *)
+		struct starpu_matrix_interface *local_interface = (struct starpu_matrix_interface *)
 			starpu_data_get_interface_on_node(handle, node);
 
-		if (node == home_node) {
+		if (node == home_node)
+		{
 			local_interface->ptr = matrix_interface->ptr;
                         local_interface->dev_handle = matrix_interface->dev_handle;
                         local_interface->offset = matrix_interface->offset;
 			local_interface->ld  = matrix_interface->ld;
 		}
-		else {
+		else
+		{
 			local_interface->ptr = 0;
 			local_interface->dev_handle = 0;
 			local_interface->offset = 0;
@@ -140,11 +151,11 @@ static void register_matrix_handle(starpu_data_handle handle, uint32_t home_node
 	}
 }
 
-static void *matrix_handle_to_pointer(starpu_data_handle handle, uint32_t node)
+static void *matrix_handle_to_pointer(starpu_data_handle_t handle, uint32_t node)
 {
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_matrix_interface_t *matrix_interface = (starpu_matrix_interface_t *)
+	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
 		starpu_data_get_interface_on_node(handle, node);
 
 	return (void*) matrix_interface->ptr;
@@ -152,11 +163,12 @@ static void *matrix_handle_to_pointer(starpu_data_handle handle, uint32_t node)
 
 
 /* declare a new data with the matrix interface */
-void starpu_matrix_data_register(starpu_data_handle *handleptr, uint32_t home_node,
+void starpu_matrix_data_register(starpu_data_handle_t *handleptr, uint32_t home_node,
 			uintptr_t ptr, uint32_t ld, uint32_t nx,
 			uint32_t ny, size_t elemsize)
 {
-	starpu_matrix_interface_t matrix_interface = {
+	struct starpu_matrix_interface matrix_interface =
+	{
 		.ptr = ptr,
 		.ld = ld,
 		.nx = nx,
@@ -169,15 +181,15 @@ void starpu_matrix_data_register(starpu_data_handle *handleptr, uint32_t home_no
 	starpu_data_register(handleptr, home_node, &matrix_interface, &_starpu_interface_matrix_ops);
 }
 
-static uint32_t footprint_matrix_interface_crc32(starpu_data_handle handle)
+static uint32_t footprint_matrix_interface_crc32(starpu_data_handle_t handle)
 {
-	return _starpu_crc32_be(starpu_matrix_get_nx(handle), starpu_matrix_get_ny(handle));
+	return starpu_crc32_be(starpu_matrix_get_nx(handle), starpu_matrix_get_ny(handle));
 }
 
 static int matrix_compare(void *data_interface_a, void *data_interface_b)
 {
-	starpu_matrix_interface_t *matrix_a = (starpu_matrix_interface_t *) data_interface_a;
-	starpu_matrix_interface_t *matrix_b = (starpu_matrix_interface_t *) data_interface_b;
+	struct starpu_matrix_interface *matrix_a = (struct starpu_matrix_interface *) data_interface_a;
+	struct starpu_matrix_interface *matrix_b = (struct starpu_matrix_interface *) data_interface_b;
 
 	/* Two matricess are considered compatible if they have the same size */
 	return ((matrix_a->nx == matrix_b->nx)
@@ -185,71 +197,71 @@ static int matrix_compare(void *data_interface_a, void *data_interface_b)
 			&& (matrix_a->elemsize == matrix_b->elemsize));
 }
 
-static void display_matrix_interface(starpu_data_handle handle, FILE *f)
+static void display_matrix_interface(starpu_data_handle_t handle, FILE *f)
 {
-	starpu_matrix_interface_t *matrix_interface = (starpu_matrix_interface_t *)
+	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	fprintf(f, "%u\t%u\t", matrix_interface->nx, matrix_interface->ny);
 }
 
-static size_t matrix_interface_get_size(starpu_data_handle handle)
+static size_t matrix_interface_get_size(starpu_data_handle_t handle)
 {
-	starpu_matrix_interface_t *matrix_interface = (starpu_matrix_interface_t *)
+	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	size_t size;
-	size = (size_t)matrix_interface->nx*matrix_interface->ny*matrix_interface->elemsize; 
+	size = (size_t)matrix_interface->nx*matrix_interface->ny*matrix_interface->elemsize;
 
 	return size;
 }
 
 /* offer an access to the data parameters */
-uint32_t starpu_matrix_get_nx(starpu_data_handle handle)
+uint32_t starpu_matrix_get_nx(starpu_data_handle_t handle)
 {
-	starpu_matrix_interface_t *matrix_interface = (starpu_matrix_interface_t *)
+	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return matrix_interface->nx;
 }
 
-uint32_t starpu_matrix_get_ny(starpu_data_handle handle)
+uint32_t starpu_matrix_get_ny(starpu_data_handle_t handle)
 {
-	starpu_matrix_interface_t *matrix_interface = (starpu_matrix_interface_t *)
+	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return matrix_interface->ny;
 }
 
-uint32_t starpu_matrix_get_local_ld(starpu_data_handle handle)
+uint32_t starpu_matrix_get_local_ld(starpu_data_handle_t handle)
 {
 	unsigned node;
 	node = _starpu_get_local_memory_node();
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_matrix_interface_t *matrix_interface = (starpu_matrix_interface_t *)
+	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
 		starpu_data_get_interface_on_node(handle, node);
 
 	return matrix_interface->ld;
 }
 
-uintptr_t starpu_matrix_get_local_ptr(starpu_data_handle handle)
+uintptr_t starpu_matrix_get_local_ptr(starpu_data_handle_t handle)
 {
 	unsigned node;
 	node = _starpu_get_local_memory_node();
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_matrix_interface_t *matrix_interface = (starpu_matrix_interface_t *)
+	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
 		starpu_data_get_interface_on_node(handle, node);
 
 	return matrix_interface->ptr;
 }
 
-size_t starpu_matrix_get_elemsize(starpu_data_handle handle)
+size_t starpu_matrix_get_elemsize(starpu_data_handle_t handle)
 {
-	starpu_matrix_interface_t *matrix_interface = (starpu_matrix_interface_t *)
+	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return matrix_interface->elemsize;
@@ -260,7 +272,7 @@ size_t starpu_matrix_get_elemsize(starpu_data_handle handle)
 /* returns the size of the allocated area */
 static ssize_t allocate_matrix_buffer_on_node(void *data_interface_, uint32_t dst_node)
 {
-	uintptr_t addr = 0;
+	uintptr_t addr = 0, handle = 0;
 	unsigned fail = 0;
 	ssize_t allocated_memory;
 
@@ -268,19 +280,20 @@ static ssize_t allocate_matrix_buffer_on_node(void *data_interface_, uint32_t ds
 	cudaError_t status;
 #endif
 
-	starpu_matrix_interface_t *matrix_interface = (starpu_matrix_interface_t *) data_interface_;
+	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *) data_interface_;
 
 	uint32_t nx = matrix_interface->nx;
 	uint32_t ny = matrix_interface->ny;
 	uint32_t ld = nx; // by default
 	size_t elemsize = matrix_interface->elemsize;
 
-	starpu_node_kind kind = _starpu_get_node_kind(dst_node);
+	enum starpu_node_kind kind = starpu_node_get_kind(dst_node);
 
-	switch(kind) {
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
-			addr = (uintptr_t)malloc((size_t)nx*ny*elemsize);
-			if (!addr) 
+			handle = addr = (uintptr_t)malloc((size_t)nx*ny*elemsize);
+			if (!addr)
 				fail = 1;
 
 			break;
@@ -291,9 +304,10 @@ static ssize_t allocate_matrix_buffer_on_node(void *data_interface_, uint32_t ds
 			{
 				if (STARPU_UNLIKELY(status != cudaErrorMemoryAllocation))
 					 STARPU_CUDA_REPORT_ERROR(status);
-					
+
 				fail = 1;
 			}
+			handle = addr;
 
 			ld = nx;
 
@@ -303,52 +317,57 @@ static ssize_t allocate_matrix_buffer_on_node(void *data_interface_, uint32_t ds
 	        case STARPU_OPENCL_RAM:
 			{
                                 int ret;
-                                void *ptr;
-                                ret = _starpu_opencl_allocate_memory(&ptr, nx*ny*elemsize, CL_MEM_READ_WRITE);
-                                addr = (uintptr_t)ptr;
-				if (ret) {
+				cl_mem mem;
+                                ret = starpu_opencl_allocate_memory(&mem, nx*ny*elemsize, CL_MEM_READ_WRITE);
+				handle = (uintptr_t)mem;
+				if (ret)
+				{
 					fail = 1;
 				}
 				break;
 			}
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 
-	if (!fail) {
+	if (!fail)
+	{
 		/* allocation succeeded */
 		allocated_memory = (size_t)nx*ny*elemsize;
 
 		/* update the data properly in consequence */
 		matrix_interface->ptr = addr;
-                matrix_interface->dev_handle = addr;
+		matrix_interface->dev_handle = handle;
                 matrix_interface->offset = 0;
 		matrix_interface->ld = ld;
-	} else {
+	}
+	else
+	{
 		/* allocation failed */
 		allocated_memory = -ENOMEM;
 	}
-	
+
 	return allocated_memory;
 }
 
 static void free_matrix_buffer_on_node(void *data_interface, uint32_t node)
 {
-	starpu_matrix_interface_t *matrix_interface = (starpu_matrix_interface_t *) data_interface;
+	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *) data_interface;
 
 #ifdef STARPU_USE_CUDA
 	cudaError_t status;
 #endif
 
-	starpu_node_kind kind = _starpu_get_node_kind(node);
-	switch(kind) {
+	enum starpu_node_kind kind = starpu_node_get_kind(node);
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
 			free((void*)matrix_interface->ptr);
 			break;
 #ifdef STARPU_USE_CUDA
 		case STARPU_CUDA_RAM:
-			status = cudaFree((void*)matrix_interface->ptr);			
+			status = cudaFree((void*)matrix_interface->ptr);
 			if (STARPU_UNLIKELY(status))
 				STARPU_CUDA_REPORT_ERROR(status);
 
@@ -356,36 +375,37 @@ static void free_matrix_buffer_on_node(void *data_interface, uint32_t node)
 #endif
 #ifdef STARPU_USE_OPENCL
                 case STARPU_OPENCL_RAM:
-                        clReleaseMemObject((void *)matrix_interface->ptr);
+			clReleaseMemObject((void *)matrix_interface->dev_handle);
                         break;
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 }
 
 #ifdef STARPU_USE_CUDA
 static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind, int is_async, cudaStream_t stream)
 {
-	starpu_matrix_interface_t *src_matrix = src_interface;
-	starpu_matrix_interface_t *dst_matrix = dst_interface;
+	struct starpu_matrix_interface *src_matrix = src_interface;
+	struct starpu_matrix_interface *dst_matrix = dst_interface;
 
 	size_t elemsize = src_matrix->elemsize;
 	cudaError_t cures;
 
 #if 0
-
 	struct cudaMemcpy3DParms p;
 	memset(&p, 0, sizeof(p));
 
-	p.srcPtr = make_cudaPitchedPtr((char *)src_matrix->ptr, src_matrix->ld * elemsize, src_matrix->ld * src_matrix->ny *elemsize, src_matrix->ny);
-	p.dstPtr = make_cudaPitchedPtr((char *)dst_matrix->ptr, dst_matrix->ld * elemsize, dst_matrix->ld * src_matrix->ny *elemsize, dst_matrix->ny);
-	p.extent = make_cudaExtent(src_matrix->nx, src_matrix->ny, 1);
+	p.srcPtr = make_cudaPitchedPtr((char *)src_matrix->ptr, src_matrix->ld * elemsize, src_matrix->ld * elemsize, src_matrix->ny);
+	p.dstPtr = make_cudaPitchedPtr((char *)dst_matrix->ptr, dst_matrix->ld * elemsize, dst_matrix->ld * elemsize, dst_matrix->ny);
+	p.extent = make_cudaExtent(src_matrix->nx * elemsize, src_matrix->ny, 1);
 	p.kind = kind;
 
 	if (is_async)
 	{
+		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpy3DAsync(&p, stream);
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		if (!cures)
 			return -EAGAIN;
 	}
@@ -393,13 +413,15 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 	cures = cudaMemcpy3D(&p);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
-#endif
+#else
 
 	if (is_async)
 	{
+		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
 			(char *)src_matrix->ptr, src_matrix->ld*elemsize,
 			src_matrix->nx*elemsize, src_matrix->ny, kind, stream);
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		if (!cures)
 			return -EAGAIN;
 	}
@@ -409,26 +431,29 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 		src_matrix->nx*elemsize, src_matrix->ny, kind);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
+#endif
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
 
 	return 0;
 }
 
-/* XXX this is broken : we need to find a way to fix that ! */
-#if 0
+/* XXX this is broken: we need to properly call cudaDeviceEnablePeerAccess(), and avoid crossing NUMA nodes... */
+#ifdef NO_STRIDE
 static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, int is_async, cudaStream_t stream)
 {
-	starpu_matrix_interface_t *src_matrix = src_interface;
-	starpu_matrix_interface_t *dst_matrix = dst_interface;
+	struct starpu_matrix_interface *src_matrix = src_interface;
+	struct starpu_matrix_interface *dst_matrix = dst_interface;
 
 	size_t elemsize = src_matrix->elemsize;
 	cudaError_t cures;
 
-#if 1
-	int src_dev = starpu_memory_node_to_devid(src_node);
-	int dst_dev = starpu_memory_node_to_devid(dst_node);
+	int src_dev = _starpu_memory_node_to_devid(src_node);
+	int dst_dev = _starpu_memory_node_to_devid(dst_node);
+
 
+#if 0
+	/* That code is not even working!! */
 	struct cudaExtent extent = make_cudaExtent(128, 128, 128);
 
 	cures = cudaSetDevice(src_dev);
@@ -455,21 +480,55 @@ static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUT
 	p.dstPtr = mem_device2;
 	p.extent = extent;
 
+	fprintf(stderr,"%u %u\n", p.srcDevice, p.dstDevice);
+	fprintf(stderr,"%p %p\n", p.srcArray, p.dstArray);
+	fprintf(stderr,"%p %lu %lu %lu\n", p.srcPtr.ptr, p.srcPtr.pitch, p.srcPtr.xsize, p.srcPtr.ysize);
+	fprintf(stderr,"%p %lu %lu %lu\n", p.dstPtr.ptr, p.dstPtr.pitch, p.dstPtr.xsize, p.dstPtr.ysize);
+	fprintf(stderr,"%lu %lu %lu\n", p.srcPos.x, p.srcPos.y, p.srcPos.z);
+	fprintf(stderr,"%lu %lu %lu\n", p.dstPos.x, p.dstPos.y, p.dstPos.z);
+	fprintf(stderr,"%lu %lu %lu\n", p.extent.width, p.extent.height, p.extent.depth);
 	cures = cudaMemcpy3DPeer(&p);
 	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
+	        STARPU_CUDA_REPORT_ERROR(cures);
+#endif
+
+#if 0
+	struct cudaMemcpy3DPeerParms p;
+	memset(&p, 0, sizeof(p));
+
+	p.srcDevice = src_dev;
+	p.dstDevice = dst_dev;
+	p.srcPtr = make_cudaPitchedPtr((char *)src_matrix->ptr, src_matrix->ld * elemsize, src_matrix->nx * elemsize, src_matrix->ny);
+	p.dstPtr = make_cudaPitchedPtr((char *)dst_matrix->ptr, dst_matrix->ld * elemsize, dst_matrix->nx * elemsize, dst_matrix->ny);
+	p.extent = make_cudaExtent(src_matrix->nx * elemsize, src_matrix->ny, 1);
+
+#if 1
+	fprintf(stderr,"%u %u\n", p.srcDevice, p.dstDevice);
+	fprintf(stderr,"%p %p\n", p.srcArray, p.dstArray);
+	fprintf(stderr,"%p %lu %lu %lu\n", p.srcPtr.ptr, p.srcPtr.pitch, p.srcPtr.xsize, p.srcPtr.ysize);
+	fprintf(stderr,"%p %lu %lu %lu\n", p.dstPtr.ptr, p.dstPtr.pitch, p.dstPtr.xsize, p.dstPtr.ysize);
+	fprintf(stderr,"%lu %lu %lu\n", p.srcPos.x, p.srcPos.y, p.srcPos.z);
+	fprintf(stderr,"%lu %lu %lu\n", p.dstPos.x, p.dstPos.y, p.dstPos.z);
+	fprintf(stderr,"%lu %lu %lu\n", p.extent.width, p.extent.height, p.extent.depth);
+#endif
 
+	cures = cudaMemcpy3DPeerAsync(&p, stream);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+	cudaThreadSynchronize();
 
-//make_cudaPitchedPtr((char *)src_matrix->ptr, src_matrix->ld * elemsize, src_matrix->nx, src_matrix->ny);
-//make_cudaPitchedPtr((char *)dst_matrix->ptr, dst_matrix->ld * elemsize, src_matrix->nx, dst_matrix->ny);
-//make_cudaExtent(src_matrix->nx, src_matrix->ny, 1);
+	if (is_async)
+	{
+		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+		cures = cudaMemcpy3DPeerAsync(&p, stream);
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+		if (!cures)
+			return -EAGAIN;
+	}
 
-//	if (is_async)
-//	{
-//		cures = cudaMemcpy3DPeerAsync(&p, stream);
-//		if (!cures)
-//			return -EAGAIN;
-//	}
+	cures = cudaMemcpy3DPeer(&p);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
 
 #else
 	/* XXX FIXME !!*/
@@ -478,7 +537,9 @@ static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUT
 
 	if (is_async)
 	{
+		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpyPeerAsync((char *)dst_matrix->ptr, dst_dev, (char *)src_matrix->ptr, src_dev, dst_matrix->nx*dst_matrix->ny*elemsize, stream);
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		if (!cures)
 			return -EAGAIN;
 	}
@@ -488,7 +549,7 @@ static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUT
 		STARPU_CUDA_REPORT_ERROR(cures);
 #endif
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
 
 	return 0;
 }
@@ -526,7 +587,7 @@ static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_
 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, 1, stream);
 }
 
-#if 0
+#ifdef NO_STRIDE
 static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
 {
 	if (src_node == dst_node)
@@ -540,41 +601,41 @@ static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node STARPU
 #ifdef STARPU_USE_OPENCL
 static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
 {
-	starpu_matrix_interface_t *src_matrix = src_interface;
-	starpu_matrix_interface_t *dst_matrix = dst_interface;
+	struct starpu_matrix_interface *src_matrix = src_interface;
+	struct starpu_matrix_interface *dst_matrix = dst_interface;
         int err,ret;
 
 	/* XXX non-contiguous matrices are not supported with OpenCL yet! (TODO) */
 	STARPU_ASSERT((src_matrix->ld == src_matrix->nx) && (dst_matrix->ld == dst_matrix->nx));
 
-	err = _starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_matrix->ptr, (cl_mem)dst_matrix->dev_handle,
+	err = starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_matrix->ptr, src_node, (cl_mem)dst_matrix->dev_handle, dst_node,
                                                            src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
                                                            dst_matrix->offset, (cl_event*)_event, &ret);
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
 
 	return ret;
 }
 
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
 {
-	starpu_matrix_interface_t *src_matrix = src_interface;
-	starpu_matrix_interface_t *dst_matrix = dst_interface;
+	struct starpu_matrix_interface *src_matrix = src_interface;
+	struct starpu_matrix_interface *dst_matrix = dst_interface;
         int err, ret;
 
 	/* XXX non-contiguous matrices are not supported with OpenCL yet! (TODO) */
 	STARPU_ASSERT((src_matrix->ld == src_matrix->nx) && (dst_matrix->ld == dst_matrix->nx));
 
-        err = _starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_matrix->dev_handle, (void*)dst_matrix->ptr,
+        err = starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_matrix->dev_handle, src_node, (void*)dst_matrix->ptr, dst_node,
                                                            src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
                                                            src_matrix->offset, (cl_event*)_event, &ret);
 
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
 
 	return ret;
 }
@@ -594,8 +655,8 @@ static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTR
 /* as not all platforms easily have a  lib installed ... */
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
-	starpu_matrix_interface_t *src_matrix = (starpu_matrix_interface_t *) src_interface;
-	starpu_matrix_interface_t *dst_matrix = (starpu_matrix_interface_t *) dst_interface;
+	struct starpu_matrix_interface *src_matrix = (struct starpu_matrix_interface *) src_interface;
+	struct starpu_matrix_interface *dst_matrix = (struct starpu_matrix_interface *) dst_interface;
 
 	unsigned y;
 	uint32_t nx = dst_matrix->nx;
@@ -614,11 +675,11 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBU
 		uint32_t src_offset = y*ld_src*elemsize;
 		uint32_t dst_offset = y*ld_dst*elemsize;
 
-		memcpy((void *)(ptr_dst + dst_offset), 
+		memcpy((void *)(ptr_dst + dst_offset),
 			(void *)(ptr_src + src_offset), nx*elemsize);
 	}
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)nx*ny*elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)nx*ny*elemsize);
 
 	return 0;
 }

+ 724 - 0
src/datawizard/interfaces/multiformat_interface.c

@@ -0,0 +1,724 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011-2012  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+#include <common/config.h>
+#include <datawizard/coherency.h>
+#include <datawizard/copy_driver.h>
+#include <datawizard/filters.h>
+#include <starpu_hash.h>
+#include <starpu_cuda.h>
+#include <starpu_opencl.h>
+#include <drivers/opencl/driver_opencl.h>
+#include <core/task.h>
+
+static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
+#ifdef STARPU_USE_CUDA
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cudaStream_t stream);
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cudaStream_t stream);
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
+#endif
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
+static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
+static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, void *_event);
+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, void *_event);
+#endif
+
+static const struct starpu_data_copy_methods multiformat_copy_data_methods_s =
+{
+	.ram_to_ram = copy_ram_to_ram,
+	.ram_to_spu = NULL,
+#ifdef STARPU_USE_CUDA
+	.ram_to_cuda = copy_ram_to_cuda,
+	.cuda_to_ram = copy_cuda_to_ram,
+	.ram_to_cuda_async = copy_ram_to_cuda_async,
+	.cuda_to_ram_async = copy_cuda_to_ram_async,
+	.cuda_to_cuda = copy_cuda_to_cuda,
+	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
+#endif
+#ifdef STARPU_USE_OPENCL
+	.ram_to_opencl = copy_ram_to_opencl,
+	.opencl_to_ram = copy_opencl_to_ram,
+	.opencl_to_opencl = copy_opencl_to_opencl,
+        .ram_to_opencl_async = copy_ram_to_opencl_async,
+	.opencl_to_ram_async = copy_opencl_to_ram_async,
+#endif
+	.cuda_to_spu = NULL,
+	.spu_to_ram = NULL,
+	.spu_to_cuda = NULL,
+	.spu_to_spu = NULL
+};
+
+static void register_multiformat_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface);
+static ssize_t allocate_multiformat_buffer_on_node(void *data_interface_, uint32_t dst_node);
+static void *multiformat_handle_to_pointer(starpu_data_handle_t data_handle, uint32_t node);
+static void free_multiformat_buffer_on_node(void *data_interface, uint32_t node);
+static size_t multiformat_interface_get_size(starpu_data_handle_t handle);
+static uint32_t footprint_multiformat_interface_crc32(starpu_data_handle_t handle);
+static int multiformat_compare(void *data_interface_a, void *data_interface_b);
+static void display_multiformat_interface(starpu_data_handle_t handle, FILE *f);
+static uint32_t starpu_multiformat_get_nx(starpu_data_handle_t handle);
+#ifdef STARPU_USE_GORDON
+static int convert_multiformat_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss);
+#endif
+
+static struct starpu_multiformat_data_interface_ops*
+get_mf_ops(void *data_interface)
+{
+	struct starpu_multiformat_interface *mf;
+	mf = (struct starpu_multiformat_interface *) data_interface;
+
+	return mf->ops;
+}
+
+static struct starpu_data_interface_ops interface_multiformat_ops =
+{
+	.register_data_handle  = register_multiformat_handle,
+	.allocate_data_on_node = allocate_multiformat_buffer_on_node,
+	.handle_to_pointer     = multiformat_handle_to_pointer,
+	.free_data_on_node     = free_multiformat_buffer_on_node,
+	.copy_methods          = &multiformat_copy_data_methods_s,
+	.get_size              = multiformat_interface_get_size,
+	.footprint             = footprint_multiformat_interface_crc32,
+	.compare               = multiformat_compare,
+#ifdef STARPU_USE_GORDON
+	.convert_to_gordon     = NULL,
+#endif
+	.interfaceid           = STARPU_MULTIFORMAT_INTERFACE_ID,
+	.interface_size        = sizeof(struct starpu_multiformat_interface),
+	.display               = display_multiformat_interface,
+	.is_multiformat        = 1,
+	.get_mf_ops            = get_mf_ops
+};
+
+static void *multiformat_handle_to_pointer(starpu_data_handle_t handle, uint32_t node)
+{
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+	struct starpu_multiformat_interface *multiformat_interface =
+		(struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, node);
+
+	switch(starpu_node_get_kind(node))
+	{
+		case STARPU_CPU_RAM:
+			return multiformat_interface->cpu_ptr;
+#ifdef STARPU_USE_CUDA
+		case STARPU_CUDA_RAM:
+			return multiformat_interface->cuda_ptr;
+#endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_RAM:
+			return multiformat_interface->opencl_ptr;
+#endif
+		default:
+			STARPU_ASSERT(0);
+	}
+}
+
+static void register_multiformat_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface)
+{
+	struct starpu_multiformat_interface *multiformat_interface;
+	multiformat_interface = (struct starpu_multiformat_interface *) data_interface;
+
+	unsigned node;
+	for (node = 0; node < STARPU_MAXNODES; node++)
+	{
+		struct starpu_multiformat_interface *local_interface =
+			(struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, node);
+
+		if (node == home_node)
+		{
+			local_interface->cpu_ptr    = multiformat_interface->cpu_ptr;
+#ifdef STARPU_USE_CUDA
+			local_interface->cuda_ptr   = multiformat_interface->cuda_ptr;
+#endif
+#ifdef STARPU_USE_OPENCL
+			local_interface->opencl_ptr = multiformat_interface->opencl_ptr;
+#endif
+		}
+		else
+		{
+			local_interface->cpu_ptr    = NULL;
+#ifdef STARPU_USE_CUDA
+			local_interface->cuda_ptr   = NULL;
+#endif
+#ifdef STARPU_USE_OPENCL
+			local_interface->opencl_ptr = NULL;
+#endif
+		}
+		local_interface->nx = multiformat_interface->nx;
+		local_interface->ops = multiformat_interface->ops;
+	}
+}
+
+void starpu_multiformat_data_register(starpu_data_handle_t *handleptr,
+				      uint32_t home_node,
+				      void *ptr,
+				      uint32_t nobjects,
+				      struct starpu_multiformat_data_interface_ops *format_ops)
+{
+#ifdef STARPU_USE_OPENCL
+	_starpu_codelet_check_deprecated_fields(format_ops->cpu_to_opencl_cl);
+	_starpu_codelet_check_deprecated_fields(format_ops->opencl_to_cpu_cl);
+#endif
+#ifdef STARPU_USE_CUDA
+	_starpu_codelet_check_deprecated_fields(format_ops->cpu_to_cuda_cl);
+	_starpu_codelet_check_deprecated_fields(format_ops->cuda_to_cpu_cl);
+#endif
+
+	struct starpu_multiformat_interface multiformat =
+	{
+		.cpu_ptr    = ptr,
+#ifdef STARPU_USE_CUDA
+		.cuda_ptr   = NULL,
+#endif
+#ifdef STARPU_USE_OPENCL
+		.opencl_ptr = NULL,
+#endif
+		.nx         = nobjects,
+		.ops        = format_ops
+	};
+
+	starpu_data_register(handleptr, home_node, &multiformat, &interface_multiformat_ops);
+}
+
+static uint32_t footprint_multiformat_interface_crc32(starpu_data_handle_t handle)
+{
+	return starpu_crc32_be(starpu_multiformat_get_nx(handle), 0);
+}
+
+static int multiformat_compare(void *data_interface_a, void *data_interface_b)
+{
+	struct starpu_multiformat_interface *multiformat_a = (struct starpu_multiformat_interface *) data_interface_a;
+	struct starpu_multiformat_interface *multiformat_b = (struct starpu_multiformat_interface *) data_interface_b;
+
+	return ((multiformat_a->nx == multiformat_b->nx)
+			&& (multiformat_a->ops->cpu_elemsize == multiformat_b->ops->cpu_elemsize)
+#ifdef STARPU_USE_CUDA
+			&& (multiformat_a->ops->cuda_elemsize == multiformat_b->ops->cuda_elemsize)
+#endif
+#ifdef STARPU_USE_OPENCL
+			&& (multiformat_a->ops->opencl_elemsize == multiformat_b->ops->opencl_elemsize)
+#endif
+		);
+}
+
+static void display_multiformat_interface(starpu_data_handle_t handle, FILE *f)
+{
+	/* TODO */
+	struct starpu_multiformat_interface *multiformat_interface;
+	multiformat_interface = (struct starpu_multiformat_interface *)
+		starpu_data_get_interface_on_node(handle, 0);
+
+	fprintf(f, "%u\t", multiformat_interface->nx);
+}
+
+/* XXX : returns CPU size */
+static size_t multiformat_interface_get_size(starpu_data_handle_t handle)
+{
+	size_t size;
+	struct starpu_multiformat_interface *multiformat_interface;
+	multiformat_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, 0);
+	size = multiformat_interface->nx * multiformat_interface->ops->cpu_elemsize;
+	return size;
+}
+
+uint32_t starpu_multiformat_get_nx(starpu_data_handle_t handle)
+{
+	struct starpu_multiformat_interface *multiformat_interface;
+	multiformat_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, 0);
+	return multiformat_interface->nx;
+}
+
+static void free_multiformat_buffer_on_node(void *data_interface, uint32_t node)
+{
+	struct starpu_multiformat_interface *multiformat_interface;
+	multiformat_interface = (struct starpu_multiformat_interface *) data_interface;
+	enum starpu_node_kind kind = starpu_node_get_kind(node);
+
+	switch(kind)
+	{
+		case STARPU_CPU_RAM:
+			free(multiformat_interface->cpu_ptr);
+			multiformat_interface->cpu_ptr = NULL;
+			break;
+#ifdef STARPU_USE_CUDA
+		case STARPU_CUDA_RAM:
+			if (multiformat_interface->cpu_ptr)
+			{
+				cudaFree(multiformat_interface->cpu_ptr);
+				multiformat_interface->cpu_ptr = NULL;
+			}
+			if (multiformat_interface->cuda_ptr)
+			{
+				cudaFree(multiformat_interface->cuda_ptr);
+				multiformat_interface->cuda_ptr = NULL;
+			}
+			break;
+#endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_RAM:
+			/* TODO */
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+	}
+}
+
+static ssize_t allocate_multiformat_buffer_on_node(void *data_interface_, uint32_t dst_node)
+{
+	struct starpu_multiformat_interface *multiformat_interface;
+	multiformat_interface = (struct starpu_multiformat_interface *) data_interface_;
+	unsigned fail = 0;
+	uintptr_t addr = 0;
+	ssize_t allocated_memory;
+
+	enum starpu_node_kind kind = starpu_node_get_kind(dst_node);
+	switch(kind)
+	{
+		case STARPU_CPU_RAM:
+			allocated_memory = multiformat_interface->nx * multiformat_interface->ops->cpu_elemsize;
+			addr = (uintptr_t)malloc(allocated_memory);
+			if (!addr)
+			{
+				fail = 1;
+			}
+			else
+			{
+				multiformat_interface->cpu_ptr = (void *) addr;
+			}
+
+#ifdef STARPU_USE_CUDA
+			multiformat_interface->cuda_ptr = malloc(multiformat_interface->nx * multiformat_interface->ops->cuda_elemsize);
+			STARPU_ASSERT(multiformat_interface->cuda_ptr != NULL);
+#endif
+#ifdef STARPU_USE_OPENCL
+			multiformat_interface->opencl_ptr = malloc(multiformat_interface->nx * multiformat_interface->ops->opencl_elemsize);
+			STARPU_ASSERT(multiformat_interface->opencl_ptr != NULL);
+#endif
+			break;
+#ifdef STARPU_USE_CUDA
+		case STARPU_CUDA_RAM:
+			{
+				allocated_memory = multiformat_interface->nx * multiformat_interface->ops->cuda_elemsize;
+				cudaError_t status = cudaMalloc((void **)&addr, allocated_memory);
+				if (STARPU_UNLIKELY(status))
+				{
+					STARPU_CUDA_REPORT_ERROR(status);
+				}
+				else
+				{
+					multiformat_interface->cuda_ptr = (void *)addr;
+				}
+
+				allocated_memory = multiformat_interface->nx * multiformat_interface->ops->cpu_elemsize;
+				status = cudaMalloc((void **)&multiformat_interface->cpu_ptr, allocated_memory);
+				if (STARPU_UNLIKELY(status != cudaSuccess))
+					STARPU_CUDA_REPORT_ERROR(status);
+				break;
+			}
+#endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_RAM:
+			{
+                                int ret;
+				cl_mem ptr;
+				allocated_memory = multiformat_interface->nx * multiformat_interface->ops->opencl_elemsize;
+                                ret = starpu_opencl_allocate_memory(&ptr, allocated_memory, CL_MEM_READ_WRITE);
+                                addr = (uintptr_t)ptr;
+				if (ret)
+				{
+					fail = 1;
+				}
+				else
+				{
+					multiformat_interface->opencl_ptr = (void *)addr;
+
+				}
+
+				ret = starpu_opencl_allocate_memory(&ptr,
+							multiformat_interface->nx * multiformat_interface->ops->cpu_elemsize,
+							CL_MEM_READ_WRITE);
+				addr = (uintptr_t)ptr;
+				if (ret)
+				{
+					fail = 1;
+				}
+				else
+				{
+					multiformat_interface->cpu_ptr = (void *) addr;
+				}
+				
+				break;
+			}
+#endif
+		default:
+			STARPU_ASSERT(0);
+	}
+
+	if (fail)
+		return -ENOMEM;
+
+	return allocated_memory;
+}
+
+
+
+
+/*
+ * Copy methods
+ */
+static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__ ((unused)),
+			   void *dst_interface, unsigned dst_node __attribute__ ((unused)))
+{
+	struct starpu_multiformat_interface *src_multiformat;
+	struct starpu_multiformat_interface *dst_multiformat;
+
+	src_multiformat = (struct starpu_multiformat_interface *) src_interface;
+	dst_multiformat = (struct starpu_multiformat_interface *) dst_interface;
+
+	STARPU_ASSERT(src_multiformat != NULL);
+	STARPU_ASSERT(dst_multiformat != NULL);
+	STARPU_ASSERT(dst_multiformat->ops != NULL);
+
+	size_t size = dst_multiformat->nx * dst_multiformat->ops->cpu_elemsize;
+	memcpy(dst_multiformat->cpu_ptr, src_multiformat->cpu_ptr, size);
+
+	return 0;
+}
+
+#ifdef STARPU_USE_CUDA
+static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__ ((unused)),
+			    void *dst_interface, unsigned dst_node __attribute__ ((unused)),
+			    enum cudaMemcpyKind kind)
+{
+	struct starpu_multiformat_interface *src_multiformat;
+	struct starpu_multiformat_interface *dst_multiformat;
+
+	src_multiformat = (struct starpu_multiformat_interface *) src_interface;
+	dst_multiformat = (struct starpu_multiformat_interface *) dst_interface;
+
+	size_t size;
+
+	cudaError_t status;
+
+	switch (kind)
+	{
+		case cudaMemcpyHostToDevice:
+		{
+			size = src_multiformat->nx * src_multiformat->ops->cuda_elemsize;
+			if (src_multiformat->cuda_ptr == NULL)
+			{
+				src_multiformat->cuda_ptr = malloc(size);
+				if (src_multiformat->cuda_ptr == NULL)
+					return -ENOMEM;
+			}
+			status = cudaMemcpy(dst_multiformat->cpu_ptr, src_multiformat->cpu_ptr, size, kind);
+			if (STARPU_UNLIKELY(status))
+			{
+				STARPU_CUDA_REPORT_ERROR(status);
+			}
+			break;
+		}
+		case cudaMemcpyDeviceToHost:
+		{
+			size = src_multiformat->nx * src_multiformat->ops->cuda_elemsize;
+			status = cudaMemcpy(dst_multiformat->cuda_ptr, src_multiformat->cuda_ptr, size, kind);
+			if (STARPU_UNLIKELY(status))
+				STARPU_CUDA_REPORT_ERROR(status);
+
+			break;
+		}
+		case cudaMemcpyDeviceToDevice:
+		{
+			size = src_multiformat->nx * src_multiformat->ops->cuda_elemsize;
+			status = cudaMemcpy(dst_multiformat->cuda_ptr, src_multiformat->cuda_ptr, size, kind);
+			if (STARPU_UNLIKELY(status))
+				STARPU_CUDA_REPORT_ERROR(status);
+			break;
+		}
+		default:
+			STARPU_ASSERT(0);
+	}
+
+	return 0;
+}
+
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node)
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
+}
+
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node)
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
+}
+
+static int copy_cuda_common_async(void *src_interface, unsigned src_node __attribute__ ((unused)),
+				  void *dst_interface, unsigned dst_node __attribute__ ((unused)),
+				  cudaStream_t stream, enum cudaMemcpyKind kind)
+{
+	struct starpu_multiformat_interface *src_multiformat;
+	struct starpu_multiformat_interface *dst_multiformat;
+
+	src_multiformat = (struct starpu_multiformat_interface *) src_interface;
+	dst_multiformat = (struct starpu_multiformat_interface *) dst_interface;
+
+	size_t size;
+	cudaError_t status;
+
+	switch (kind)
+	{
+		case cudaMemcpyHostToDevice:
+		{
+			size = src_multiformat->nx * src_multiformat->ops->cuda_elemsize;
+			if (src_multiformat->cuda_ptr == NULL)
+			{
+				src_multiformat->cuda_ptr = malloc(size);
+				if (src_multiformat->cuda_ptr == NULL)
+					return -ENOMEM;
+			}
+
+			status = cudaMemcpyAsync(dst_multiformat->cpu_ptr, src_multiformat->cpu_ptr, size, kind, stream);
+			if (STARPU_UNLIKELY(status))
+			{
+				STARPU_CUDA_REPORT_ERROR(status);
+			}
+			break;
+		}
+		case cudaMemcpyDeviceToHost:
+		{
+			size = src_multiformat->nx * src_multiformat->ops->cuda_elemsize;
+			status = cudaMemcpy(dst_multiformat->cuda_ptr, src_multiformat->cuda_ptr, size, kind);
+			if (STARPU_UNLIKELY(status))
+				STARPU_CUDA_REPORT_ERROR(status);
+
+			break;
+		}
+		case cudaMemcpyDeviceToDevice:
+		{
+			size = src_multiformat->nx * src_multiformat->ops->cuda_elemsize;
+			status = cudaMemcpyAsync(dst_multiformat->cuda_ptr, src_multiformat->cuda_ptr, size, kind, stream);
+			if (STARPU_UNLIKELY(status))
+				STARPU_CUDA_REPORT_ERROR(status);
+			break;
+		}
+		default:
+			STARPU_ASSERT(0);
+	}
+
+	return 0;
+}
+
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cudaStream_t stream)
+{
+	return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
+}
+
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cudaStream_t stream)
+{
+	return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
+}
+
+#ifdef HAVE_CUDA_MEMCPY_PEER
+static int copy_cuda_peer_common(void *src_interface, unsigned src_node,
+				void *dst_interface, unsigned dst_node,
+				cudaStream_t stream)
+{
+	struct starpu_multiformat_interface *src_multiformat;
+	struct starpu_multiformat_interface *dst_multiformat;
+
+	src_multiformat = (struct starpu_multiformat_interface *) src_interface;
+	dst_multiformat = (struct starpu_multiformat_interface *) dst_interface;
+
+	STARPU_ASSERT(src_multiformat != NULL);
+	STARPU_ASSERT(dst_multiformat != NULL);
+	STARPU_ASSERT(src_multiformat->ops != NULL);
+
+	cudaError_t status;
+	int size = src_multiformat->nx * src_multiformat->ops->cuda_elemsize;
+	int src_dev = _starpu_memory_node_to_devid(src_node);
+	int dst_dev = _starpu_memory_node_to_devid(dst_node);
+
+	if (stream)
+	{
+		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+		status = cudaMemcpyPeerAsync(dst_multiformat->cuda_ptr, dst_dev,
+					     src_multiformat->cuda_ptr, src_dev,
+					     size, stream);
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+		/* All good! Still, returning -EAGAIN, because we will need to
+                   check the transfer completion later */
+		if (status == cudaSuccess)
+			return -EAGAIN;
+	}
+
+	/* Either a synchronous transfer was requested, or the asynchronous one
+           failed. */
+	status = cudaMemcpyPeer(dst_multiformat->cuda_ptr, dst_dev,
+				src_multiformat->cuda_ptr, src_dev,
+				size);
+	if (STARPU_UNLIKELY(status != cudaSuccess))
+		STARPU_CUDA_REPORT_ERROR(status);
+
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, size);
+
+	return 0;
+}
+#endif
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
+{
+	if (src_node == dst_node)
+	{
+		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
+	}
+	else
+	{
+#ifdef HAVE_CUDA_MEMCPY_PEER
+		return copy_cuda_peer_common(src_interface, src_node,
+					     dst_interface, dst_node,
+					     NULL);
+#else
+		STARPU_ASSERT(0);
+#endif
+	}
+}
+
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,
+				   void *dst_interface, unsigned dst_node,
+				   cudaStream_t stream)
+{
+	if (src_node == dst_node)
+	{
+		return copy_cuda_common_async(src_interface, src_node,
+					      dst_interface, dst_node,
+					      stream, cudaMemcpyDeviceToDevice);
+	}
+	else
+	{
+#ifdef HAVE_CUDA_MEMCPY_PEER
+		return copy_cuda_peer_common(src_interface, src_node,
+					     dst_interface, dst_node,
+					     stream);
+#else
+		STARPU_ASSERT(0);
+#endif
+	}
+}
+#endif /* STARPU_USE_CUDA */
+
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
+				    void *dst_interface, unsigned dst_node,
+				    void *_event)
+{
+	int err, ret;
+	size_t size;
+	struct starpu_multiformat_interface *src_multiformat;
+	struct starpu_multiformat_interface *dst_multiformat;
+
+	src_multiformat = (struct starpu_multiformat_interface *) src_interface;
+	dst_multiformat = (struct starpu_multiformat_interface *) dst_interface;
+
+	STARPU_ASSERT(src_multiformat != NULL);
+	STARPU_ASSERT(dst_multiformat != NULL);
+	STARPU_ASSERT(src_multiformat->ops != NULL);
+
+	size = src_multiformat->nx * src_multiformat->ops->opencl_elemsize;
+
+
+	err = starpu_opencl_copy_ram_to_opencl_async_sync(src_multiformat->cpu_ptr,
+							   src_node,
+							   (cl_mem) dst_multiformat->cpu_ptr,
+							   dst_node,
+							   size,
+							   0,
+							   (cl_event *) _event,
+							   &ret);
+        if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, size);
+	return ret;
+}
+
+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
+				    void *dst_interface, unsigned dst_node,
+				    void *_event)
+{
+	int err, ret;
+	size_t size;
+	struct starpu_multiformat_interface *src_multiformat;
+	struct starpu_multiformat_interface *dst_multiformat;
+
+	src_multiformat = (struct starpu_multiformat_interface *) src_interface;
+	dst_multiformat = (struct starpu_multiformat_interface *) dst_interface;
+
+	STARPU_ASSERT(src_multiformat != NULL);
+	STARPU_ASSERT(dst_multiformat != NULL);
+	STARPU_ASSERT(src_multiformat->ops != NULL);
+	STARPU_ASSERT(dst_multiformat->ops != NULL);
+
+	size = src_multiformat->nx * src_multiformat->ops->opencl_elemsize;
+
+	if (dst_multiformat->opencl_ptr == NULL) {
+		/* XXX : it is weird that we might have to allocate memory here... */
+		dst_multiformat->opencl_ptr = malloc(dst_multiformat->nx * dst_multiformat->ops->opencl_elemsize);
+	}
+	err = starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_multiformat->opencl_ptr,
+							   src_node,
+							   dst_multiformat->opencl_ptr,
+							   dst_node,
+							   size,
+                                                           0,
+							   (cl_event *)_event,
+							   &ret);
+        if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, size);
+
+
+	return ret;
+}
+
+static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+                              void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
+{
+        return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
+}
+
+static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
+{
+        return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
+}
+
+static int copy_opencl_to_opencl(void *src_interface, unsigned src_node,
+                                 void *dst_interface, unsigned dst_node)
+{
+	(void) src_interface;
+	(void) dst_interface;
+	(void) src_node;
+	(void) dst_node;
+/* TODO */
+	return 0;
+}
+#endif
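
The new file above also exports starpu_multiformat_data_register(), the entry point applications use to declare such data. Below is a minimal usage sketch (not part of the commit), assuming a CPU-only build: the point structure, NX and the task-submission step are illustrative assumptions, while the registration call, the cpu_elemsize field and the handle type are the ones introduced by this file.

#include <starpu.h>

struct point { float x, y; };	/* hypothetical CPU-side layout */
#define NX 1024
static struct point points[NX];

/* Only the CPU element size is filled in here; cuda_elemsize/opencl_elemsize
 * and the conversion codelets would be added when building with CUDA/OpenCL. */
static struct starpu_multiformat_data_interface_ops point_format_ops =
{
	.cpu_elemsize = sizeof(struct point),
};

int main(void)
{
	starpu_data_handle_t handle;
	starpu_init(NULL);
	/* Node 0 is main memory; register NX objects laid out as struct point. */
	starpu_multiformat_data_register(&handle, 0, points, NX, &point_format_ops);
	/* ... submit tasks that access the handle ... */
	starpu_data_unregister(handle);
	starpu_shutdown();
	return 0;
}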

+ 101 - 78
src/datawizard/interfaces/variable_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,7 +20,7 @@
 #include <datawizard/coherency.h>
 #include <datawizard/copy_driver.h>
 #include <datawizard/filters.h>
-#include <common/hash.h>
+#include <starpu_hash.h>
 #include <starpu_cuda.h>
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
@@ -42,7 +42,8 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event);
 #endif
 
-static const struct starpu_data_copy_methods variable_copy_data_methods_s = {
+static const struct starpu_data_copy_methods variable_copy_data_methods_s =
+{
 	.ram_to_ram = copy_ram_to_ram,
 	.ram_to_spu = NULL,
 #ifdef STARPU_USE_CUDA
@@ -66,19 +67,20 @@ static const struct starpu_data_copy_methods variable_copy_data_methods_s = {
 	.spu_to_spu = NULL
 };
 
-static void register_variable_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
+static void register_variable_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface);
 static ssize_t allocate_variable_buffer_on_node(void *data_interface_, uint32_t dst_node);
-static void *variable_handle_to_pointer(starpu_data_handle data_handle, uint32_t node);
+static void *variable_handle_to_pointer(starpu_data_handle_t data_handle, uint32_t node);
 static void free_variable_buffer_on_node(void *data_interface, uint32_t node);
-static size_t variable_interface_get_size(starpu_data_handle handle);
-static uint32_t footprint_variable_interface_crc32(starpu_data_handle handle);
+static size_t variable_interface_get_size(starpu_data_handle_t handle);
+static uint32_t footprint_variable_interface_crc32(starpu_data_handle_t handle);
 static int variable_compare(void *data_interface_a, void *data_interface_b);
-static void display_variable_interface(starpu_data_handle handle, FILE *f);
+static void display_variable_interface(starpu_data_handle_t handle, FILE *f);
 #ifdef STARPU_USE_GORDON
-static int convert_variable_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss); 
+static int convert_variable_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss);
 #endif
 
-static struct starpu_data_interface_ops_t interface_variable_ops = {
+static struct starpu_data_interface_ops interface_variable_ops =
+{
 	.register_data_handle = register_variable_handle,
 	.allocate_data_on_node = allocate_variable_buffer_on_node,
 	.handle_to_pointer = variable_handle_to_pointer,
@@ -91,29 +93,31 @@ static struct starpu_data_interface_ops_t interface_variable_ops = {
 	.convert_to_gordon = convert_variable_to_gordon,
 #endif
 	.interfaceid = STARPU_VARIABLE_INTERFACE_ID,
-	.interface_size = sizeof(starpu_variable_interface_t), 
-	.display = display_variable_interface
+	.interface_size = sizeof(struct starpu_variable_interface),
+	.display = display_variable_interface,
 };
 
-static void *variable_handle_to_pointer(starpu_data_handle handle, uint32_t node)
+static void *variable_handle_to_pointer(starpu_data_handle_t handle, uint32_t node)
 {
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
 	return (void*) STARPU_VARIABLE_GET_PTR(starpu_data_get_interface_on_node(handle, node));
 }
 
-static void register_variable_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface)
+static void register_variable_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface)
 {
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		starpu_variable_interface_t *local_interface = (starpu_variable_interface_t *)
+		struct starpu_variable_interface *local_interface = (struct starpu_variable_interface *)
 			starpu_data_get_interface_on_node(handle, node);
 
-		if (node == home_node) {
+		if (node == home_node)
+		{
 			local_interface->ptr = STARPU_VARIABLE_GET_PTR(data_interface);
 		}
-		else {
+		else
+		{
 			local_interface->ptr = 0;
 		}
 
@@ -122,7 +126,7 @@ static void register_variable_handle(starpu_data_handle handle, uint32_t home_no
 }
 
 #ifdef STARPU_USE_GORDON
-int convert_variable_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss) 
+int convert_variable_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss)
 {
 	*ptr = STARPU_VARIABLE_GET_PTR(interface);
 	(*ss).size = STARPU_VARIABLE_GET_ELEMSIZE(interface);
@@ -132,49 +136,50 @@ int convert_variable_to_gordon(void *data_interface, uint64_t *ptr, gordon_strid
 #endif
 
 /* declare a new data with the variable interface */
-void starpu_variable_data_register(starpu_data_handle *handleptr, uint32_t home_node,
+void starpu_variable_data_register(starpu_data_handle_t *handleptr, uint32_t home_node,
                         uintptr_t ptr, size_t elemsize)
 {
-	starpu_variable_interface_t variable = {
+	struct starpu_variable_interface variable =
+	{
 		.ptr = ptr,
 		.elemsize = elemsize
-	};	
+	};
 
-	starpu_data_register(handleptr, home_node, &variable, &interface_variable_ops); 
+	starpu_data_register(handleptr, home_node, &variable, &interface_variable_ops);
 }
 
 
-static uint32_t footprint_variable_interface_crc32(starpu_data_handle handle)
+static uint32_t footprint_variable_interface_crc32(starpu_data_handle_t handle)
 {
-	return _starpu_crc32_be(starpu_variable_get_elemsize(handle), 0);
+	return starpu_crc32_be(starpu_variable_get_elemsize(handle), 0);
 }
 
 static int variable_compare(void *data_interface_a, void *data_interface_b)
 {
-	starpu_variable_interface_t *variable_a = (starpu_variable_interface_t *) data_interface_a;
-	starpu_variable_interface_t *variable_b = (starpu_variable_interface_t *) data_interface_b;
+	struct starpu_variable_interface *variable_a = (struct starpu_variable_interface *) data_interface_a;
+	struct starpu_variable_interface *variable_b = (struct starpu_variable_interface *) data_interface_b;
 
 	/* Two variables are considered compatible if they have the same size */
 	return (variable_a->elemsize == variable_b->elemsize);
-} 
+}
 
-static void display_variable_interface(starpu_data_handle handle, FILE *f)
+static void display_variable_interface(starpu_data_handle_t handle, FILE *f)
 {
-	starpu_variable_interface_t *variable_interface = (starpu_variable_interface_t *)
+	struct starpu_variable_interface *variable_interface = (struct starpu_variable_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	fprintf(f, "%ld\t", (long)variable_interface->elemsize);
 }
 
-static size_t variable_interface_get_size(starpu_data_handle handle)
+static size_t variable_interface_get_size(starpu_data_handle_t handle)
 {
-	starpu_variable_interface_t *variable_interface = (starpu_variable_interface_t *)
+	struct starpu_variable_interface *variable_interface = (struct starpu_variable_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return variable_interface->elemsize;
 }
 
-uintptr_t starpu_variable_get_local_ptr(starpu_data_handle handle)
+uintptr_t starpu_variable_get_local_ptr(starpu_data_handle_t handle)
 {
 	unsigned node;
 	node = _starpu_get_local_memory_node();
@@ -184,7 +189,7 @@ uintptr_t starpu_variable_get_local_ptr(starpu_data_handle handle)
 	return STARPU_VARIABLE_GET_PTR(starpu_data_get_interface_on_node(handle, node));
 }
 
-size_t starpu_variable_get_elemsize(starpu_data_handle handle)
+size_t starpu_variable_get_elemsize(starpu_data_handle_t handle)
 {
 	return STARPU_VARIABLE_GET_ELEMSIZE(starpu_data_get_interface_on_node(handle, 0));
 }
@@ -194,7 +199,7 @@ size_t starpu_variable_get_elemsize(starpu_data_handle handle)
 /* returns the size of the allocated area */
 static ssize_t allocate_variable_buffer_on_node(void *data_interface_, uint32_t dst_node)
 {
-	starpu_variable_interface_t *variable_interface = (starpu_variable_interface_t *) data_interface_;
+	struct starpu_variable_interface *variable_interface = (struct starpu_variable_interface *) data_interface_;
 
 	unsigned fail = 0;
 	uintptr_t addr = 0;
@@ -202,13 +207,14 @@ static ssize_t allocate_variable_buffer_on_node(void *data_interface_, uint32_t
 
 	size_t elemsize = variable_interface->elemsize;
 
-	starpu_node_kind kind = _starpu_get_node_kind(dst_node);
+	enum starpu_node_kind kind = starpu_node_get_kind(dst_node);
 
 #ifdef STARPU_USE_CUDA
 	cudaError_t status;
 #endif
 
-	switch(kind) {
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
 			addr = (uintptr_t)malloc(elemsize);
 			if (!addr)
@@ -230,17 +236,18 @@ static ssize_t allocate_variable_buffer_on_node(void *data_interface_, uint32_t
 	        case STARPU_OPENCL_RAM:
 			{
                                 int ret;
-                                void *ptr;
-                                ret = _starpu_opencl_allocate_memory(&ptr, elemsize, CL_MEM_READ_WRITE);
+				cl_mem ptr;
+                                ret = starpu_opencl_allocate_memory(&ptr, elemsize, CL_MEM_READ_WRITE);
                                 addr = (uintptr_t)ptr;
-				if (ret) {
+				if (ret)
+				{
 					fail = 1;
 				}
 				break;
 			}
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 
 	if (fail)
@@ -251,14 +258,15 @@ static ssize_t allocate_variable_buffer_on_node(void *data_interface_, uint32_t
 
 	/* update the data properly in consequence */
 	variable_interface->ptr = addr;
-	
+
 	return allocated_memory;
 }
 
 static void free_variable_buffer_on_node(void *data_interface, uint32_t node)
 {
-	starpu_node_kind kind = _starpu_get_node_kind(node);
-	switch(kind) {
+	enum starpu_node_kind kind = starpu_node_get_kind(node);
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
 			free((void*)STARPU_VARIABLE_GET_PTR(data_interface));
 			break;
@@ -273,7 +281,7 @@ static void free_variable_buffer_on_node(void *data_interface, uint32_t node)
                         break;
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 }
 
@@ -281,8 +289,8 @@ static void free_variable_buffer_on_node(void *data_interface, uint32_t node)
 static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
 				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
 {
-	starpu_variable_interface_t *src_variable = src_interface;
-	starpu_variable_interface_t *dst_variable = dst_interface;
+	struct starpu_variable_interface *src_variable = src_interface;
+	struct starpu_variable_interface *dst_variable = dst_interface;
 
 	cudaError_t cures;
 	cures = cudaMemcpy((char *)dst_variable->ptr, (char *)src_variable->ptr, src_variable->elemsize, kind);
@@ -290,7 +298,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
 
 	return 0;
 }
@@ -315,20 +323,21 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRI
 	{
 		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
 	}
-	else {
+	else
+	{
 #ifdef HAVE_CUDA_MEMCPY_PEER
-		int src_dev = starpu_memory_node_to_devid(src_node);
-		int dst_dev = starpu_memory_node_to_devid(dst_node);
+		int src_dev = _starpu_memory_node_to_devid(src_node);
+		int dst_dev = _starpu_memory_node_to_devid(dst_node);
 
-		starpu_variable_interface_t *src_variable = src_interface;
-		starpu_variable_interface_t *dst_variable = dst_interface;
+		struct starpu_variable_interface *src_variable = src_interface;
+		struct starpu_variable_interface *dst_variable = dst_interface;
 
 		cudaError_t cures;
 		cures = cudaMemcpyPeer((char *)dst_variable->ptr, dst_dev, (char *)src_variable->ptr, src_dev, src_variable->elemsize);
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
 
-		STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
+		_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
 
 #else
 		/* This is illegal without support for cudaMemcpyPeer */
@@ -342,11 +351,13 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 					void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED,
 					cudaStream_t stream, enum cudaMemcpyKind kind)
 {
-	starpu_variable_interface_t *src_variable = src_interface;
-	starpu_variable_interface_t *dst_variable = dst_interface;
+	struct starpu_variable_interface *src_variable = src_interface;
+	struct starpu_variable_interface *dst_variable = dst_interface;
 
 	cudaError_t cures;
+	_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 	cures = cudaMemcpyAsync((char *)dst_variable->ptr, (char *)src_variable->ptr, src_variable->elemsize, kind, stream);
+	_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 	if (cures)
 	{
 		/* do it in a synchronous fashion */
@@ -358,7 +369,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 		return 0;
 	}
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
 
 	return -EAGAIN;
 }
@@ -382,18 +393,21 @@ static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,					v
 	{
 		return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToDevice);
 	}
-	else {
+	else
+	{
 #ifdef HAVE_CUDA_MEMCPY_PEER
-		int src_dev = starpu_memory_node_to_devid(src_node);
-		int dst_dev = starpu_memory_node_to_devid(dst_node);
+		int src_dev = _starpu_memory_node_to_devid(src_node);
+		int dst_dev = _starpu_memory_node_to_devid(dst_node);
 
-		starpu_variable_interface_t *src_variable = src_interface;
-		starpu_variable_interface_t *dst_variable = dst_interface;
+		struct starpu_variable_interface *src_variable = src_interface;
+		struct starpu_variable_interface *dst_variable = dst_interface;
 
 		size_t length = src_variable->elemsize;
 
 		cudaError_t cures;
+		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpyPeerAsync((char *)dst_variable->ptr, dst_dev, (char *)src_variable->ptr, src_dev, length, stream);
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		if (cures)
 		{
 			/* sychronous fallback */
@@ -404,7 +418,7 @@ static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,					v
 			return 0;
 		}
 
-		STARPU_TRACE_DATA_COPY(src_node, dst_node, length);
+		_STARPU_TRACE_DATA_COPY(src_node, dst_node, length);
 
 		return -EAGAIN;
 #else
@@ -422,33 +436,33 @@ static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,					v
 static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface,
                                     unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
 {
-	starpu_variable_interface_t *src_variable = src_interface;
-	starpu_variable_interface_t *dst_variable = dst_interface;
+	struct starpu_variable_interface *src_variable = src_interface;
+	struct starpu_variable_interface *dst_variable = dst_interface;
         int err,ret;
 
-        err = _starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_variable->ptr, (cl_mem)dst_variable->ptr, src_variable->elemsize,
+        err = starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_variable->ptr, src_node, (cl_mem)dst_variable->ptr, dst_node, src_variable->elemsize,
                                                            0, (cl_event*)_event, &ret);
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
 
 	return ret;
 }
 
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
 {
-	starpu_variable_interface_t *src_variable = src_interface;
-	starpu_variable_interface_t *dst_variable = dst_interface;
+	struct starpu_variable_interface *src_variable = src_interface;
+	struct starpu_variable_interface *dst_variable = dst_interface;
         int err, ret;
 
-	err = _starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_variable->ptr, (void*)dst_variable->ptr, src_variable->elemsize,
+	err = starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_variable->ptr, src_node, (void*)dst_variable->ptr, dst_node, src_variable->elemsize,
                                                            0, (cl_event*)_event, &ret);
 
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
 
 	return ret;
 }
@@ -467,21 +481,30 @@ static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_A
 {
 	cl_int err;
 
-	starpu_variable_interface_t *src_variable = src_interface;
-	starpu_variable_interface_t *dst_variable = dst_interface;
+	struct starpu_variable_interface *src_variable = src_interface;
+	struct starpu_variable_interface *dst_variable = dst_interface;
 
 	cl_mem src_ptr = (cl_mem)src_variable->ptr;
 	cl_mem dst_ptr = (cl_mem)dst_variable->ptr;
 
 	cl_command_queue cq;
 	starpu_opencl_get_current_queue(&cq);
+	cl_event event;
 
 	STARPU_ASSERT(src_variable->elemsize == dst_variable->elemsize);
-	err= clEnqueueCopyBuffer(cq, src_ptr, dst_ptr, 0, 0, src_variable->elemsize, 0, NULL, NULL);
+	err= clEnqueueCopyBuffer(cq, src_ptr, dst_ptr, 0, 0, src_variable->elemsize, 0, NULL, &event);
+	if (STARPU_UNLIKELY(err))
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = clWaitForEvents(1, &event);
+	if (STARPU_UNLIKELY(err))
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = clReleaseEvent(event);
 	if (STARPU_UNLIKELY(err))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
 
 	return 0;
 }
@@ -490,8 +513,8 @@ static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_A
 
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
-	starpu_variable_interface_t *src_variable = (starpu_variable_interface_t *) src_interface;
-	starpu_variable_interface_t *dst_variable = (starpu_variable_interface_t *) dst_interface;
+	struct starpu_variable_interface *src_variable = (struct starpu_variable_interface *) src_interface;
+	struct starpu_variable_interface *dst_variable = (struct starpu_variable_interface *) dst_interface;
 
 	size_t elemsize = dst_variable->elemsize;
 
@@ -500,7 +523,7 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBU
 
 	memcpy((void *)ptr_dst, (void *)ptr_src, elemsize);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, elemsize);
 
 	return 0;
 }
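
The copy_opencl_to_opencl() change above now waits on the event returned by clEnqueueCopyBuffer() and releases it, instead of leaving the copy in flight. A minimal standalone sketch of that enqueue/wait/release pattern outside StarPU (the helper name and error handling are illustrative, not part of the patch):

    #include <CL/cl.h>

    /* Enqueue a device-to-device copy, wait for its completion, then release
     * the event so it does not leak.  Returns the first OpenCL error met. */
    static cl_int blocking_buffer_copy(cl_command_queue cq, cl_mem src, cl_mem dst, size_t size)
    {
        cl_event event;
        cl_int err;

        err = clEnqueueCopyBuffer(cq, src, dst, 0, 0, size, 0, NULL, &event);
        if (err != CL_SUCCESS)
            return err;

        err = clWaitForEvents(1, &event);
        if (err != CL_SUCCESS)
        {
            clReleaseEvent(event);
            return err;
        }

        return clReleaseEvent(event);
    }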

+ 53 - 45
src/datawizard/interfaces/vector_filters.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
@@ -22,9 +22,9 @@
 
 void starpu_block_filter_func_vector(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, unsigned nchunks)
 {
-        starpu_vector_interface_t *vector_father = (starpu_vector_interface_t *) father_interface;
-        starpu_vector_interface_t *vector_child = (starpu_vector_interface_t *) child_interface;
-	
+        struct starpu_vector_interface *vector_father = (struct starpu_vector_interface *) father_interface;
+        struct starpu_vector_interface *vector_child = (struct starpu_vector_interface *) child_interface;
+
 	uint32_t nx = vector_father->nx;
 	size_t elemsize = vector_father->elemsize;
 
@@ -33,16 +33,18 @@ void starpu_block_filter_func_vector(void *father_interface, void *child_interfa
 	uint32_t chunk_size = (nx + nchunks - 1)/nchunks;
 	size_t offset = id*chunk_size*elemsize;
 
-	uint32_t child_nx = 
+	uint32_t child_nx =
 	  STARPU_MIN(chunk_size, nx - id*chunk_size);
 
 	vector_child->nx = child_nx;
 	vector_child->elemsize = elemsize;
 
-	if (vector_father->ptr) {
-	  vector_child->ptr = vector_father->ptr + offset;
-	  vector_child->dev_handle = vector_father->dev_handle;
-	  vector_child->offset = vector_father->offset + offset;
+	if (vector_father->dev_handle)
+	{
+		if (vector_father->ptr)
+			vector_child->ptr = vector_father->ptr + offset;
+		vector_child->dev_handle = vector_father->dev_handle;
+		vector_child->offset = vector_father->offset + offset;
 	}
 }
 
@@ -51,9 +53,9 @@ void starpu_vector_divide_in_2_filter_func(void *father_interface, void *child_i
 {
         /* there cannot be more than 2 chunks */
         STARPU_ASSERT(id < 2);
-	
-	starpu_vector_interface_t *vector_father = (starpu_vector_interface_t *) father_interface;
-	starpu_vector_interface_t *vector_child = (starpu_vector_interface_t *) child_interface;
+
+	struct starpu_vector_interface *vector_father = (struct starpu_vector_interface *) father_interface;
+	struct starpu_vector_interface *vector_child = (struct starpu_vector_interface *) child_interface;
 
 	uint32_t length_first = f->filter_arg;
 
@@ -61,37 +63,41 @@ void starpu_vector_divide_in_2_filter_func(void *father_interface, void *child_i
 	size_t elemsize = vector_father->elemsize;
 
 	STARPU_ASSERT(length_first < nx);
-	
+
 	/* this is the first child */
-	if (id == 0) {
-	  vector_child->nx = length_first;
-	  vector_child->elemsize = elemsize;
-
-	  if (vector_father->ptr) {
-	    vector_child->ptr = vector_father->ptr;
-	    vector_child->offset = vector_father->offset;
-	    vector_child->dev_handle = vector_father->dev_handle;
-	  }
+	if (id == 0)
+	{
+		vector_child->nx = length_first;
+		vector_child->elemsize = elemsize;
+
+		if (vector_father->dev_handle)
+		{
+			if (vector_father->ptr)
+				vector_child->ptr = vector_father->ptr;
+			vector_child->offset = vector_father->offset;
+			vector_child->dev_handle = vector_father->dev_handle;
+		}
 	}
-
-	/* the second child */
-	else {
-	  vector_child->nx = nx - length_first;
-	  vector_child->elemsize = elemsize;
-
-	  if (vector_father->ptr) {
-	    vector_child->ptr = vector_father->ptr + length_first*elemsize;
-	    vector_child->offset = vector_father->offset + length_first*elemsize;
-	    vector_child->dev_handle = vector_father->dev_handle;
-	  }
+	else /* the second child */
+	{
+		vector_child->nx = nx - length_first;
+		vector_child->elemsize = elemsize;
+
+		if (vector_father->dev_handle)
+		{
+			if (vector_father->ptr)
+				vector_child->ptr = vector_father->ptr + length_first*elemsize;
+			vector_child->offset = vector_father->offset + length_first*elemsize;
+			vector_child->dev_handle = vector_father->dev_handle;
+		}
 	}
 }
 
 
 void starpu_vector_list_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, STARPU_ATTRIBUTE_UNUSED unsigned nchunks)
 {
-        starpu_vector_interface_t *vector_father = (starpu_vector_interface_t *) father_interface;
-        starpu_vector_interface_t *vector_child = (starpu_vector_interface_t *) child_interface;
+        struct starpu_vector_interface *vector_father = (struct starpu_vector_interface *) father_interface;
+        struct starpu_vector_interface *vector_child = (struct starpu_vector_interface *) child_interface;
 
         uint32_t *length_tab = (uint32_t *) f->filter_arg_ptr;
 
@@ -103,15 +109,17 @@ void starpu_vector_list_filter_func(void *father_interface, void *child_interfac
 
 	vector_child->nx = chunk_size;
 	vector_child->elemsize = elemsize;
-	
-	if (vector_father->ptr) {
-	  /* compute the current position */
-	  unsigned i;
-	  for (i = 0; i < id; i++) 
-	    current_pos += length_tab[i];
-	  
-	  vector_child->ptr = vector_father->ptr + current_pos*elemsize;
-	  vector_child->offset = vector_father->offset + current_pos*elemsize;
-	  vector_child->dev_handle = vector_father->dev_handle;
+
+	if (vector_father->dev_handle)
+	{
+		/* compute the current position */
+		unsigned i;
+		for (i = 0; i < id; i++)
+			current_pos += length_tab[i];
+
+		if (vector_father->ptr)
+			vector_child->ptr = vector_father->ptr + current_pos*elemsize;
+		vector_child->offset = vector_father->offset + current_pos*elemsize;
+		vector_child->dev_handle = vector_father->dev_handle;
 	}
 }
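
For context, a hedged sketch of how these vector filters are typically driven through the public partitioning API of this StarPU generation; the array, handle, and function names are illustrative. The dev_handle guard added above is what lets the filter run even when the father replicate has no CPU-side ptr yet:

    #include <starpu.h>

    #define NX 1024
    static float vec[NX];

    void partition_in_four(void)
    {
        starpu_data_handle_t handle;
        starpu_vector_data_register(&handle, 0, (uintptr_t)vec, NX, sizeof(vec[0]));

        struct starpu_data_filter f =
        {
            .filter_func = starpu_block_filter_func_vector,
            .nchildren = 4,
        };

        /* Creates 4 sub-vectors; each child gets nx and elemsize, and (only
         * when the father is allocated) ptr/dev_handle/offset. */
        starpu_data_partition(handle, &f);

        /* ... submit tasks on starpu_data_get_sub_data(handle, 1, i) ... */

        starpu_data_unpartition(handle, 0);
        starpu_data_unregister(handle);
    }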

+ 111 - 87
src/datawizard/interfaces/vector_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,7 +20,7 @@
 #include <datawizard/coherency.h>
 #include <datawizard/copy_driver.h>
 #include <datawizard/filters.h>
-#include <common/hash.h>
+#include <starpu_hash.h>
 #include <starpu_cuda.h>
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
@@ -42,7 +42,8 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, void *_event);
 #endif
 
-static const struct starpu_data_copy_methods vector_copy_data_methods_s = {
+static const struct starpu_data_copy_methods vector_copy_data_methods_s =
+{
 	.ram_to_ram = copy_ram_to_ram,
 	.ram_to_spu = NULL,
 #ifdef STARPU_USE_CUDA
@@ -66,19 +67,20 @@ static const struct starpu_data_copy_methods vector_copy_data_methods_s = {
 	.spu_to_spu = NULL
 };
 
-static void register_vector_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
+static void register_vector_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface);
 static ssize_t allocate_vector_buffer_on_node(void *data_interface_, uint32_t dst_node);
-static void *vector_handle_to_pointer(starpu_data_handle data_handle, uint32_t node);
+static void *vector_handle_to_pointer(starpu_data_handle_t data_handle, uint32_t node);
 static void free_vector_buffer_on_node(void *data_interface, uint32_t node);
-static size_t vector_interface_get_size(starpu_data_handle handle);
-static uint32_t footprint_vector_interface_crc32(starpu_data_handle handle);
+static size_t vector_interface_get_size(starpu_data_handle_t handle);
+static uint32_t footprint_vector_interface_crc32(starpu_data_handle_t handle);
 static int vector_compare(void *data_interface_a, void *data_interface_b);
-static void display_vector_interface(starpu_data_handle handle, FILE *f);
+static void display_vector_interface(starpu_data_handle_t handle, FILE *f);
 #ifdef STARPU_USE_GORDON
-static int convert_vector_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss); 
+static int convert_vector_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss);
 #endif
 
-static struct starpu_data_interface_ops_t interface_vector_ops = {
+static struct starpu_data_interface_ops interface_vector_ops =
+{
 	.register_data_handle = register_vector_handle,
 	.allocate_data_on_node = allocate_vector_buffer_on_node,
 	.handle_to_pointer = vector_handle_to_pointer,
@@ -91,36 +93,38 @@ static struct starpu_data_interface_ops_t interface_vector_ops = {
 	.convert_to_gordon = convert_vector_to_gordon,
 #endif
 	.interfaceid = STARPU_VECTOR_INTERFACE_ID,
-	.interface_size = sizeof(starpu_vector_interface_t), 
-	.display = display_vector_interface
+	.interface_size = sizeof(struct starpu_vector_interface),
+	.display = display_vector_interface,
 };
 
-static void *vector_handle_to_pointer(starpu_data_handle handle, uint32_t node)
+static void *vector_handle_to_pointer(starpu_data_handle_t handle, uint32_t node)
 {
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_vector_interface_t *vector_interface = (starpu_vector_interface_t *)
+	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
 		starpu_data_get_interface_on_node(handle, node);
 
 	return (void*) vector_interface->ptr;
 }
 
-static void register_vector_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface)
+static void register_vector_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface)
 {
-	starpu_vector_interface_t *vector_interface = (starpu_vector_interface_t *) data_interface;
+	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *) data_interface;
 
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-		starpu_vector_interface_t *local_interface = (starpu_vector_interface_t *)
+		struct starpu_vector_interface *local_interface = (struct starpu_vector_interface *)
 			starpu_data_get_interface_on_node(handle, node);
 
-		if (node == home_node) {
+		if (node == home_node)
+		{
 			local_interface->ptr = vector_interface->ptr;
                         local_interface->dev_handle = vector_interface->dev_handle;
                         local_interface->offset = vector_interface->offset;
 		}
-		else {
+		else
+		{
 			local_interface->ptr = 0;
                         local_interface->dev_handle = 0;
                         local_interface->offset = 0;
@@ -132,10 +136,10 @@ static void register_vector_handle(starpu_data_handle handle, uint32_t home_node
 }
 
 #ifdef STARPU_USE_GORDON
-int convert_vector_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss) 
+int convert_vector_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss)
 {
-	starpu_vector_interface_t *vector_interface = interface;
-	
+	struct starpu_vector_interface *vector_interface = interface;
+
 	*ptr = vector_interface->ptr;
 	(*ss).size = vector_interface->nx * vector_interface->elemsize;
 
@@ -144,48 +148,49 @@ int convert_vector_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideS
 #endif
 
 /* declare a new data with the vector interface */
-void starpu_vector_data_register(starpu_data_handle *handleptr, uint32_t home_node,
+void starpu_vector_data_register(starpu_data_handle_t *handleptr, uint32_t home_node,
                         uintptr_t ptr, uint32_t nx, size_t elemsize)
 {
-	starpu_vector_interface_t vector = {
+	struct starpu_vector_interface vector =
+	{
 		.ptr = ptr,
 		.nx = nx,
 		.elemsize = elemsize,
                 .dev_handle = ptr,
                 .offset = 0
-	};	
+	};
 
-	starpu_data_register(handleptr, home_node, &vector, &interface_vector_ops); 
+	starpu_data_register(handleptr, home_node, &vector, &interface_vector_ops);
 }
 
 
-static uint32_t footprint_vector_interface_crc32(starpu_data_handle handle)
+static uint32_t footprint_vector_interface_crc32(starpu_data_handle_t handle)
 {
-	return _starpu_crc32_be(starpu_vector_get_nx(handle), 0);
+	return starpu_crc32_be(starpu_vector_get_nx(handle), 0);
 }
 
 static int vector_compare(void *data_interface_a, void *data_interface_b)
 {
-	starpu_vector_interface_t *vector_a = (starpu_vector_interface_t *) data_interface_a;
-	starpu_vector_interface_t *vector_b = (starpu_vector_interface_t *) data_interface_b;
+	struct starpu_vector_interface *vector_a = (struct starpu_vector_interface *) data_interface_a;
+	struct starpu_vector_interface *vector_b = (struct starpu_vector_interface *) data_interface_b;
 
 	/* Two vectors are considered compatible if they have the same size */
 	return ((vector_a->nx == vector_b->nx)
 			&& (vector_a->elemsize == vector_b->elemsize));
 }
 
-static void display_vector_interface(starpu_data_handle handle, FILE *f)
+static void display_vector_interface(starpu_data_handle_t handle, FILE *f)
 {
-	starpu_vector_interface_t *vector_interface = (starpu_vector_interface_t *)
+	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	fprintf(f, "%u\t", vector_interface->nx);
 }
 
-static size_t vector_interface_get_size(starpu_data_handle handle)
+static size_t vector_interface_get_size(starpu_data_handle_t handle)
 {
 	size_t size;
-	starpu_vector_interface_t *vector_interface = (starpu_vector_interface_t *)
+	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	size = vector_interface->nx*vector_interface->elemsize;
@@ -194,30 +199,30 @@ static size_t vector_interface_get_size(starpu_data_handle handle)
 }
 
 /* offer an access to the data parameters */
-uint32_t starpu_vector_get_nx(starpu_data_handle handle)
+uint32_t starpu_vector_get_nx(starpu_data_handle_t handle)
 {
-	starpu_vector_interface_t *vector_interface = (starpu_vector_interface_t *)
+	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return vector_interface->nx;
 }
 
-uintptr_t starpu_vector_get_local_ptr(starpu_data_handle handle)
+uintptr_t starpu_vector_get_local_ptr(starpu_data_handle_t handle)
 {
 	unsigned node;
 	node = _starpu_get_local_memory_node();
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_vector_interface_t *vector_interface = (starpu_vector_interface_t *)
+	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
 		starpu_data_get_interface_on_node(handle, node);
 
 	return vector_interface->ptr;
 }
 
-size_t starpu_vector_get_elemsize(starpu_data_handle handle)
+size_t starpu_vector_get_elemsize(starpu_data_handle_t handle)
 {
-	starpu_vector_interface_t *vector_interface = (starpu_vector_interface_t *)
+	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
 	return vector_interface->elemsize;
@@ -228,24 +233,25 @@ size_t starpu_vector_get_elemsize(starpu_data_handle handle)
 /* returns the size of the allocated area */
 static ssize_t allocate_vector_buffer_on_node(void *data_interface_, uint32_t dst_node)
 {
-	starpu_vector_interface_t *vector_interface = (starpu_vector_interface_t *) data_interface_;
+	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *) data_interface_;
 
 	unsigned fail = 0;
-	uintptr_t addr = 0;
+	uintptr_t addr = 0, handle = 0;
 	ssize_t allocated_memory;
 
 	uint32_t nx = vector_interface->nx;
 	size_t elemsize = vector_interface->elemsize;
 
-	starpu_node_kind kind = _starpu_get_node_kind(dst_node);
+	enum starpu_node_kind kind = starpu_node_get_kind(dst_node);
 
 #ifdef STARPU_USE_CUDA
 	cudaError_t status;
 #endif
 
-	switch(kind) {
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
-			addr = (uintptr_t)malloc(nx*elemsize);
+			addr = handle = (uintptr_t)malloc(nx*elemsize);
 			if (!addr)
 				fail = 1;
 			break;
@@ -259,23 +265,25 @@ static ssize_t allocate_vector_buffer_on_node(void *data_interface_, uint32_t ds
 
 				fail = 1;
 			}
+			handle = addr;
 			break;
 #endif
 #ifdef STARPU_USE_OPENCL
 	        case STARPU_OPENCL_RAM:
 			{
                                 int ret;
-                                void *ptr;
-                                ret = _starpu_opencl_allocate_memory(&ptr, nx*elemsize, CL_MEM_READ_WRITE);
-                                addr = (uintptr_t)ptr;
-				if (ret) {
+				cl_mem mem;
+                                ret = starpu_opencl_allocate_memory(&mem, nx*elemsize, CL_MEM_READ_WRITE);
+				handle = (uintptr_t)mem;
+				if (ret)
+				{
 					fail = 1;
 				}
 				break;
 			}
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 
 	if (fail)
@@ -286,22 +294,23 @@ static ssize_t allocate_vector_buffer_on_node(void *data_interface_, uint32_t ds
 
 	/* update the data properly in consequence */
 	vector_interface->ptr = addr;
-        vector_interface->dev_handle = addr;
+	vector_interface->dev_handle = handle;
         vector_interface->offset = 0;
-	
+
 	return allocated_memory;
 }
 
 static void free_vector_buffer_on_node(void *data_interface, uint32_t node)
 {
-	starpu_vector_interface_t *vector_interface = (starpu_vector_interface_t *) data_interface;
+	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *) data_interface;
 
 #ifdef STARPU_USE_CUDA
 	cudaError_t cures;
 #endif
 
-	starpu_node_kind kind = _starpu_get_node_kind(node);
-	switch(kind) {
+	enum starpu_node_kind kind = starpu_node_get_kind(node);
+	switch(kind)
+	{
 		case STARPU_CPU_RAM:
 			free((void*)vector_interface->ptr);
 			break;
@@ -313,11 +322,11 @@ static void free_vector_buffer_on_node(void *data_interface, uint32_t node)
 #endif
 #ifdef STARPU_USE_OPENCL
                 case STARPU_OPENCL_RAM:
-                        clReleaseMemObject((void *)vector_interface->ptr);
+			clReleaseMemObject((cl_mem)vector_interface->dev_handle);
                         break;
 #endif
 		default:
-			assert(0);
+			STARPU_ASSERT(0);
 	}
 }
 
@@ -325,8 +334,8 @@ static void free_vector_buffer_on_node(void *data_interface, uint32_t node)
 static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
 				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
 {
-	starpu_vector_interface_t *src_vector = src_interface;
-	starpu_vector_interface_t *dst_vector = dst_interface;
+	struct starpu_vector_interface *src_vector = src_interface;
+	struct starpu_vector_interface *dst_vector = dst_interface;
 
 	cudaError_t cures;
 
@@ -334,7 +343,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
 
 	return 0;
 }
@@ -346,19 +355,21 @@ static int copy_cuda_peer_common(void *src_interface, unsigned src_node,
 {
 	cudaError_t cures;
 
-	starpu_vector_interface_t *src_vector = src_interface;
-	starpu_vector_interface_t *dst_vector = dst_interface;
+	struct starpu_vector_interface *src_vector = src_interface;
+	struct starpu_vector_interface *dst_vector = dst_interface;
 
 	size_t length = src_vector->nx*src_vector->elemsize;
 
-	int src_dev = starpu_memory_node_to_devid(src_node);
-	int dst_dev = starpu_memory_node_to_devid(dst_node);
+	int src_dev = _starpu_memory_node_to_devid(src_node);
+	int dst_dev = _starpu_memory_node_to_devid(dst_node);
 
 	if (is_async)
 	{
+		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpyPeerAsync((char *)dst_vector->ptr, dst_dev,
 						(char *)src_vector->ptr, src_dev,
 						length, stream);
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		if (!cures)
 			return -EAGAIN;
 	}
@@ -368,7 +379,7 @@ static int copy_cuda_peer_common(void *src_interface, unsigned src_node,
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, length);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, length);
 
 	return 0;
 }
@@ -393,7 +404,8 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRI
 	{
 		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
 	}
-	else {
+	else
+	{
 #ifdef HAVE_CUDA_MEMCPY_PEER
 		return copy_cuda_peer_common(src_interface, src_node, dst_interface, dst_node, 0, 0);
 #else
@@ -408,12 +420,14 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 					void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED,
 					cudaStream_t stream, enum cudaMemcpyKind kind)
 {
-	starpu_vector_interface_t *src_vector = src_interface;
-	starpu_vector_interface_t *dst_vector = dst_interface;
+	struct starpu_vector_interface *src_vector = src_interface;
+	struct starpu_vector_interface *dst_vector = dst_interface;
 
 	cudaError_t cures;
 
+	_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 	cures = cudaMemcpyAsync((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, kind, stream);
+	_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 	if (cures)
 	{
 		/* do it in a synchronous fashion */
@@ -424,18 +438,19 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 		return 0;
 	}
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
 
 	return -EAGAIN;
 }
 
-static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,					void *dst_interface, unsigned dst_node, cudaStream_t stream)
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
 {
 	if (src_node == dst_node)
 	{
 		return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToDevice);
 	}
-	else {
+	else
+	{
 #ifdef HAVE_CUDA_MEMCPY_PEER
 		return copy_cuda_peer_common(src_interface, src_node, dst_interface, dst_node, 1, stream);
 #else
@@ -464,17 +479,17 @@ static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_
 static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
                                     void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
 {
-	starpu_vector_interface_t *src_vector = src_interface;
-	starpu_vector_interface_t *dst_vector = dst_interface;
+	struct starpu_vector_interface *src_vector = src_interface;
+	struct starpu_vector_interface *dst_vector = dst_interface;
         int err, ret;
 
-	err = _starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_vector->ptr, (cl_mem)dst_vector->dev_handle,
+	err = starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_vector->ptr, src_node, (cl_mem)dst_vector->dev_handle, dst_node,
                                                            src_vector->nx*src_vector->elemsize,
                                                            dst_vector->offset, (cl_event*)_event, &ret);
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
 
 	return ret;
 }
@@ -482,16 +497,16 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
                                     void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
 {
-	starpu_vector_interface_t *src_vector = src_interface;
-	starpu_vector_interface_t *dst_vector = dst_interface;
+	struct starpu_vector_interface *src_vector = src_interface;
+	struct starpu_vector_interface *dst_vector = dst_interface;
         int err, ret;
 
-	err = _starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_vector->dev_handle, (void*)dst_vector->ptr, src_vector->nx*src_vector->elemsize,
+	err = starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_vector->dev_handle, src_node, (void*)dst_vector->ptr, dst_node, src_vector->nx*src_vector->elemsize,
                                                            src_vector->offset, (cl_event *)_event, &ret);
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
 
 	return ret;
 }
@@ -513,19 +528,28 @@ static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_A
 {
         int err;
 
-	starpu_vector_interface_t *src_vector = src_interface;
-	starpu_vector_interface_t *dst_vector = dst_interface;
+	struct starpu_vector_interface *src_vector = src_interface;
+	struct starpu_vector_interface *dst_vector = dst_interface;
 
 	cl_command_queue cq;
 	starpu_opencl_get_current_queue(&cq);
 
 	size_t size = src_vector->nx*src_vector->elemsize;
+	cl_event event;
+
+	err = clEnqueueCopyBuffer(cq, (cl_mem)src_vector->dev_handle, (cl_mem)dst_vector->dev_handle, src_vector->offset, dst_vector->offset, size, 0, NULL, &event);
+        if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = clWaitForEvents(1, &event);
+        if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = clEnqueueCopyBuffer(cq, (cl_mem)src_vector->dev_handle, (cl_mem)dst_vector->dev_handle, src_vector->offset, dst_vector->offset, size, 0, NULL, NULL); 
+	err = clReleaseEvent(event);
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
 
 	return 0;
 }
@@ -536,8 +560,8 @@ static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_A
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
 					void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
-	starpu_vector_interface_t *src_vector = (starpu_vector_interface_t *) src_interface;
-	starpu_vector_interface_t *dst_vector = (starpu_vector_interface_t *) dst_interface;
+	struct starpu_vector_interface *src_vector = (struct starpu_vector_interface *) src_interface;
+	struct starpu_vector_interface *dst_vector = (struct starpu_vector_interface *) dst_interface;
 
 	uint32_t nx = dst_vector->nx;
 	size_t elemsize = dst_vector->elemsize;
@@ -547,7 +571,7 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBU
 
 	memcpy((void *)ptr_dst, (void *)ptr_src, nx*elemsize);
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, nx*elemsize);
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nx*elemsize);
 
 	return 0;
 }
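
A hedged sketch of the consumer side: once the allocation and copy methods above have produced a local replicate, a CPU kernel reads it through the public vector accessors. The codelet function itself is illustrative:

    #include <starpu.h>

    /* Scale a vector in place; buffers[0] is the per-node
     * struct starpu_vector_interface filled in by the methods above. */
    void scal_cpu_func(void *buffers[], void *cl_arg)
    {
        float factor = *(float *) cl_arg;
        struct starpu_vector_interface *v = (struct starpu_vector_interface *) buffers[0];

        float *ptr = (float *) STARPU_VECTOR_GET_PTR(v);
        uint32_t nx = STARPU_VECTOR_GET_NX(v);

        uint32_t i;
        for (i = 0; i < nx; i++)
            ptr[i] *= factor;
    }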

+ 16 - 14
src/datawizard/interfaces/void_interface.c

@@ -20,7 +20,7 @@
 #include <datawizard/coherency.h>
 #include <datawizard/copy_driver.h>
 #include <datawizard/filters.h>
-#include <common/hash.h>
+#include <starpu_hash.h>
 #include <starpu_cuda.h>
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
@@ -33,7 +33,8 @@ static int dummy_cuda_copy_async(void *src_interface, unsigned src_node, void *d
 static int dummy_opencl_copy_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *_event);
 #endif
 
-static const struct starpu_data_copy_methods void_copy_data_methods_s = {
+static const struct starpu_data_copy_methods void_copy_data_methods_s =
+{
 	.ram_to_ram = dummy_copy,
 	.ram_to_spu = dummy_copy,
 #ifdef STARPU_USE_CUDA
@@ -56,15 +57,16 @@ static const struct starpu_data_copy_methods void_copy_data_methods_s = {
 	.spu_to_spu = dummy_copy
 };
 
-static void register_void_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
+static void register_void_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface);
 static ssize_t allocate_void_buffer_on_node(void *data_interface_, uint32_t dst_node);
 static void free_void_buffer_on_node(void *data_interface, uint32_t node);
-static size_t void_interface_get_size(starpu_data_handle handle);
-static uint32_t footprint_void_interface_crc32(starpu_data_handle handle);
+static size_t void_interface_get_size(starpu_data_handle_t handle);
+static uint32_t footprint_void_interface_crc32(starpu_data_handle_t handle);
 static int void_compare(void *data_interface_a, void *data_interface_b);
-static void display_void_interface(starpu_data_handle handle, FILE *f);
+static void display_void_interface(starpu_data_handle_t handle, FILE *f);
 
-static struct starpu_data_interface_ops_t interface_void_ops = {
+static struct starpu_data_interface_ops interface_void_ops =
+{
 	.register_data_handle = register_void_handle,
 	.allocate_data_on_node = allocate_void_buffer_on_node,
 	.free_data_on_node = free_void_buffer_on_node,
@@ -73,11 +75,11 @@ static struct starpu_data_interface_ops_t interface_void_ops = {
 	.footprint = footprint_void_interface_crc32,
 	.compare = void_compare,
 	.interfaceid = STARPU_VOID_INTERFACE_ID,
-	.interface_size = 0, 
+	.interface_size = 0,
 	.display = display_void_interface
 };
 
-static void register_void_handle(starpu_data_handle handle STARPU_ATTRIBUTE_UNUSED,
+static void register_void_handle(starpu_data_handle_t handle STARPU_ATTRIBUTE_UNUSED,
 				uint32_t home_node STARPU_ATTRIBUTE_UNUSED,
 				void *data_interface STARPU_ATTRIBUTE_UNUSED)
 {
@@ -85,13 +87,13 @@ static void register_void_handle(starpu_data_handle handle STARPU_ATTRIBUTE_UNUS
 }
 
 /* declare a new data with the void interface */
-void starpu_void_data_register(starpu_data_handle *handleptr)
+void starpu_void_data_register(starpu_data_handle_t *handleptr)
 {
-	starpu_data_register(handleptr, 0, NULL, &interface_void_ops); 
+	starpu_data_register(handleptr, 0, NULL, &interface_void_ops);
 }
 
 
-static uint32_t footprint_void_interface_crc32(starpu_data_handle handle STARPU_ATTRIBUTE_UNUSED)
+static uint32_t footprint_void_interface_crc32(starpu_data_handle_t handle STARPU_ATTRIBUTE_UNUSED)
 {
 	return 0;
 }
@@ -104,12 +106,12 @@ static int void_compare(void *data_interface_a STARPU_ATTRIBUTE_UNUSED,
 	return 1;
 }
 
-static void display_void_interface(starpu_data_handle handle STARPU_ATTRIBUTE_UNUSED, FILE *f)
+static void display_void_interface(starpu_data_handle_t handle STARPU_ATTRIBUTE_UNUSED, FILE *f)
 {
 	fprintf(f, "void\t");
 }
 
-static size_t void_interface_get_size(starpu_data_handle handle STARPU_ATTRIBUTE_UNUSED)
+static size_t void_interface_get_size(starpu_data_handle_t handle STARPU_ATTRIBUTE_UNUSED)
 {
 	return 0;
 }
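
As a usage sketch (assuming the starpu_insert_task() helper and illustrative codelet pointers), a void handle is registered purely to express task dependencies; the dummy copy methods above mean no bytes are ever moved for it:

    #include <starpu.h>

    /* Serialize two codelets through a payload-free handle. */
    void chain_through_void(struct starpu_codelet *first, struct starpu_codelet *second)
    {
        starpu_data_handle_t sync;
        starpu_void_data_register(&sync);

        /* Writer then reader: the second task waits for the first although
         * no data is ever transferred. */
        starpu_insert_task(first, STARPU_W, sync, 0);
        starpu_insert_task(second, STARPU_R, sync, 0);

        starpu_data_unregister(sync);
    }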

+ 317 - 144
src/datawizard/memalloc.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -17,24 +17,39 @@
 
 #include <datawizard/memalloc.h>
 #include <datawizard/footprint.h>
+#include <starpu_cuda.h>
+#include <starpu_opencl.h>
 
 /* This per-node RW-locks protect mc_list and memchunk_cache entries */
-static pthread_rwlock_t mc_rwlock[STARPU_MAXNODES]; 
+static pthread_rwlock_t mc_rwlock[STARPU_MAXNODES];
+
+/* This per-node RW-lock protects lru_list */
+static pthread_rwlock_t lru_rwlock[STARPU_MAXNODES];
+
+/* Least recently used memory chunks */
+static struct _starpu_mem_chunk_lru_list *starpu_lru_list[STARPU_MAXNODES];
 
 /* Potentially in use memory chunks */
-static starpu_mem_chunk_list_t mc_list[STARPU_MAXNODES];
+static struct _starpu_mem_chunk_list *mc_list[STARPU_MAXNODES];
 
 /* Explicitly caches memory chunks that can be reused */
-static starpu_mem_chunk_list_t memchunk_cache[STARPU_MAXNODES];
+static struct _starpu_mem_chunk_list *memchunk_cache[STARPU_MAXNODES];
+
+/* When reclaiming memory to allocate, we reclaim MAX(what_is_to_reclaim_on_device, data_size_coefficient*data_size) */
+const unsigned starpu_memstrategy_data_size_coefficient=2;
+
+static void starpu_lru(unsigned node);
 
 void _starpu_init_mem_chunk_lists(void)
 {
 	unsigned i;
 	for (i = 0; i < STARPU_MAXNODES; i++)
 	{
-		PTHREAD_RWLOCK_INIT(&mc_rwlock[i], NULL);
-		mc_list[i] = starpu_mem_chunk_list_new();
-		memchunk_cache[i] = starpu_mem_chunk_list_new();
+		_STARPU_PTHREAD_RWLOCK_INIT(&mc_rwlock[i], NULL);
+		_STARPU_PTHREAD_RWLOCK_INIT(&lru_rwlock[i], NULL);
+		mc_list[i] = _starpu_mem_chunk_list_new();
+		starpu_lru_list[i] = _starpu_mem_chunk_lru_list_new();
+		memchunk_cache[i] = _starpu_mem_chunk_list_new();
 	}
 }
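
The comment above starpu_memstrategy_data_size_coefficient states the sizing rule applied when an allocation fails. A small sketch of that rule; the helper name and its node_target parameter are illustrative, not StarPU API:

    /* Reclaim at least coefficient*data_size bytes, or more when the per-node
     * pressure (node_target) asks for it. */
    static size_t compute_reclaim_target(size_t node_target, size_t data_size)
    {
        size_t by_data = starpu_memstrategy_data_size_coefficient * data_size;
        return (node_target > by_data) ? node_target : by_data;
    }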
 
@@ -43,8 +58,9 @@ void _starpu_deinit_mem_chunk_lists(void)
 	unsigned i;
 	for (i = 0; i < STARPU_MAXNODES; i++)
 	{
-		starpu_mem_chunk_list_delete(mc_list[i]);
-		starpu_mem_chunk_list_delete(memchunk_cache[i]);
+		_starpu_mem_chunk_list_delete(mc_list[i]);
+		_starpu_mem_chunk_list_delete(memchunk_cache[i]);
+		_starpu_mem_chunk_lru_list_delete(starpu_lru_list[i]);
 	}
 }
 
@@ -52,7 +68,7 @@ void _starpu_deinit_mem_chunk_lists(void)
  *	Manipulate subtrees
  */
 
-static void lock_all_subtree(starpu_data_handle handle)
+static void lock_all_subtree(starpu_data_handle_t handle)
 {
 	if (handle->nchildren == 0)
 	{
@@ -60,7 +76,8 @@ static void lock_all_subtree(starpu_data_handle handle)
 		while (_starpu_spin_trylock(&handle->header_lock))
 			_starpu_datawizard_progress(_starpu_get_local_memory_node(), 0);
 	}
-	else {
+	else
+	{
 		/* lock all sub-subtrees children */
 		unsigned child;
 		for (child = 0; child < handle->nchildren; child++)
@@ -70,15 +87,16 @@ static void lock_all_subtree(starpu_data_handle handle)
 	}
 }
 
-static void unlock_all_subtree(starpu_data_handle handle)
+static void unlock_all_subtree(starpu_data_handle_t handle)
 {
 	if (handle->nchildren == 0)
 	{
-		/* this is a leaf */	
+		/* this is a leaf */
 		_starpu_spin_unlock(&handle->header_lock);
 	}
-	else {
-		/* lock all sub-subtrees children 
+	else
+	{
+		/* lock all sub-subtrees children
 		 * Note that this is done in the reverse order of the
 		 * lock_all_subtree so that we avoid deadlock */
 		unsigned i;
@@ -90,16 +108,16 @@ static void unlock_all_subtree(starpu_data_handle handle)
 	}
 }
 
-static unsigned may_free_subtree(starpu_data_handle handle, unsigned node)
+static unsigned may_free_subtree(starpu_data_handle_t handle, unsigned node)
 {
 	/* we only free if no one refers to the leaf */
 	uint32_t refcnt = _starpu_get_data_refcnt(handle, node);
 	if (refcnt)
 		return 0;
-	
+
 	if (!handle->nchildren)
 		return 1;
-	
+
 	/* look into all sub-subtrees children */
 	unsigned child;
 	for (child = 0; child < handle->nchildren; child++)
@@ -113,8 +131,8 @@ static unsigned may_free_subtree(starpu_data_handle handle, unsigned node)
 	return 1;
 }
 
-static void transfer_subtree_to_node(starpu_data_handle handle, unsigned src_node, 
-						unsigned dst_node)
+static void transfer_subtree_to_node(starpu_data_handle_t handle, unsigned src_node,
+				     unsigned dst_node)
 {
 	unsigned i;
 	unsigned last = 0;
@@ -123,11 +141,12 @@ static void transfer_subtree_to_node(starpu_data_handle handle, unsigned src_nod
 
 	if (handle->nchildren == 0)
 	{
-		struct starpu_data_replicate_s *src_replicate = &handle->per_node[src_node];
-		struct starpu_data_replicate_s *dst_replicate = &handle->per_node[dst_node];
+		struct _starpu_data_replicate *src_replicate = &handle->per_node[src_node];
+		struct _starpu_data_replicate *dst_replicate = &handle->per_node[dst_node];
 
 		/* this is a leaf */
-		switch(src_replicate->state) {
+		switch(src_replicate->state)
+		{
 		case STARPU_OWNER:
 			/* the local node has the only copy */
 			/* the owner is now the destination_node */
@@ -138,14 +157,19 @@ static void transfer_subtree_to_node(starpu_data_handle handle, unsigned src_nod
 #warning we should use requests during memory reclaim
 #endif
 			/* TODO use request !! */
+			/* Take temporary references on the replicates */
 			src_replicate->refcnt++;
 			dst_replicate->refcnt++;
+			handle->busy_count+=2;
 
 			ret = _starpu_driver_copy_data_1_to_1(handle, src_replicate, dst_replicate, 0, NULL, 1);
 			STARPU_ASSERT(ret == 0);
 
 			src_replicate->refcnt--;
 			dst_replicate->refcnt--;
+			STARPU_ASSERT(handle->busy_count >= 2);
+			handle->busy_count -= 2;
+			_starpu_data_check_not_busy(handle);
 
 			break;
 		case STARPU_SHARED:
@@ -156,11 +180,13 @@ static void transfer_subtree_to_node(starpu_data_handle handle, unsigned src_nod
 			cnt = 0;
 			for (i = 0; i < STARPU_MAXNODES; i++)
 			{
-				if (handle->per_node[i].state == STARPU_SHARED) {
-					cnt++; 
+				if (handle->per_node[i].state == STARPU_SHARED)
+				{
+					cnt++;
 					last = i;
 				}
 			}
+			STARPU_ASSERT(cnt > 0);
 
 			if (cnt == 1)
 				handle->per_node[last].state = STARPU_OWNER;
@@ -174,7 +200,8 @@ static void transfer_subtree_to_node(starpu_data_handle handle, unsigned src_nod
 			break;
 		}
 	}
-	else {
+	else
+	{
 		/* lock all sub-subtrees children */
 		unsigned child;
 		for (child = 0; child < handle->nchildren; child++)
@@ -185,20 +212,20 @@ static void transfer_subtree_to_node(starpu_data_handle handle, unsigned src_nod
 	}
 }
 
-static size_t free_memory_on_node(starpu_mem_chunk_t mc, uint32_t node)
+static size_t free_memory_on_node(struct _starpu_mem_chunk *mc, uint32_t node)
 {
 	size_t freed = 0;
 
 	STARPU_ASSERT(mc->ops);
 	STARPU_ASSERT(mc->ops->free_data_on_node);
 
-	starpu_data_handle handle = mc->data;
+	starpu_data_handle_t handle = mc->data;
 
 	/* Does this memory chunk refers to a handle that does not exist
 	 * anymore ? */
 	unsigned data_was_deleted = mc->data_was_deleted;
 
-	struct starpu_data_replicate_s *replicate = mc->replicate;
+	struct _starpu_data_replicate *replicate = mc->replicate;
 
 //	while (_starpu_spin_trylock(&handle->header_lock))
 //		_starpu_datawizard_progress(_starpu_get_local_memory_node());
@@ -208,20 +235,20 @@ static size_t free_memory_on_node(starpu_mem_chunk_t mc, uint32_t node)
 #endif
 //	_starpu_spin_lock(&handle->header_lock);
 
-	if (mc->automatically_allocated && 
+	if (mc->automatically_allocated &&
 		(!handle || data_was_deleted || replicate->refcnt == 0))
 	{
 		if (handle && !data_was_deleted)
 			STARPU_ASSERT(replicate->allocated);
 
 #if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER)
-		if (_starpu_get_node_kind(node) == STARPU_CUDA_RAM)
+		if (starpu_node_get_kind(node) == STARPU_CUDA_RAM)
 		{
 			/* To facilitate the design of interface, we set the
 			 * proper CUDA device in case it is needed. This avoids
 			 * having to set it again in the free method of each
 			 * interface. */
-			cudaError_t err = cudaSetDevice(starpu_memory_node_to_devid(node));
+			cudaError_t err = cudaSetDevice(_starpu_memory_node_to_devid(node));
 			STARPU_ASSERT(err == cudaSuccess);
 		}
 #endif
@@ -249,35 +276,47 @@ static size_t free_memory_on_node(starpu_mem_chunk_t mc, uint32_t node)
 
 
 
-static size_t do_free_mem_chunk(starpu_mem_chunk_t mc, unsigned node)
+static size_t do_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
 {
 	size_t size;
 
+	mc->replicate->mc=NULL;
+
 	/* free the actual buffer */
 	size = free_memory_on_node(mc, node);
 
 	/* remove the mem_chunk from the list */
-	starpu_mem_chunk_list_erase(mc_list[node], mc);
+	_starpu_mem_chunk_list_erase(mc_list[node], mc);
 
 	free(mc->chunk_interface);
-	starpu_mem_chunk_delete(mc);
+	_starpu_mem_chunk_delete(mc);
 
-	return size; 
+	return size;
 }
 
 /* This function is called for memory chunks that are possibly in used (ie. not
  * in the cache). They should therefore still be associated to a handle. */
-static size_t try_to_free_mem_chunk(starpu_mem_chunk_t mc, unsigned node)
+static size_t try_to_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
 {
 	size_t freed = 0;
 
-	starpu_data_handle handle;
+	starpu_data_handle_t handle;
 	handle = mc->data;
 	STARPU_ASSERT(handle);
 
-	/* Either it's a "relaxed coherency" memchunk, or it's a memchunk that
-	 * could be used with filters. */
-	if (mc->relaxed_coherency)
+	/* This data should be written through to this node, avoid dropping it! */
+	if (handle->wt_mask & (1<<node))
+		return 0;
+
+	/* REDUX memchunk */
+	if (mc->relaxed_coherency == 2)
+	{
+		/* TODO: reduce it back to e.g. main memory */
+	}
+	else
+	/* Either it's a "relaxed coherency" memchunk (SCRATCH), or it's a
+	 * memchunk that could be used with filters. */
+	if (mc->relaxed_coherency == 1)
 	{
 		STARPU_ASSERT(mc->replicate);
 
@@ -295,25 +334,35 @@ static size_t try_to_free_mem_chunk(starpu_mem_chunk_t mc, unsigned node)
 
 		_starpu_spin_unlock(&handle->header_lock);
 	}
-	else {
+	else
+	{
 		/* try to lock all the leafs of the subtree */
 		lock_all_subtree(handle);
-	
+
 		/* check if they are all "free" */
 		if (may_free_subtree(handle, node))
 		{
 			STARPU_ASSERT(handle->per_node[node].refcnt == 0);
-	
-			/* in case there was nobody using that buffer, throw it 
+
+#ifdef STARPU_MEMORY_STATUS
+			if (handle->per_node[node].state == STARPU_OWNER)
+				_starpu_handle_stats_invalidated(handle, node);
+			/* else XXX Considering only owner to invalidate */
+#endif
+
+			/* in case there was nobody using that buffer, throw it
 			 * away after writing it back to main memory */
 			transfer_subtree_to_node(handle, node, 0);
-	
+
+#ifdef STARPU_MEMORY_STATUS
+			_starpu_handle_stats_loaded_owner(handle, 0);
+#endif
 			STARPU_ASSERT(handle->per_node[node].refcnt == 0);
-	
+
 			/* now the actual buffer may be freed */
 			freed = do_free_mem_chunk(mc, node);
 		}
-	
+
 		/* unlock the leafs */
 		unlock_all_subtree(handle);
 	}
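
The new wt_mask test above keeps write-through replicates from being reclaimed. A hedged sketch of the public call that sets this mask (the wrapper function is illustrative):

    #include <starpu.h>

    /* Request write-through to main memory (node 0): modified data is pushed
     * back there, and with the check above that replicate is not evicted. */
    void keep_in_main_ram(starpu_data_handle_t handle)
    {
        starpu_data_set_wt_mask(handle, 1 << 0);
    }
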
@@ -324,21 +373,18 @@ static size_t try_to_free_mem_chunk(starpu_mem_chunk_t mc, unsigned node)
 /* We assume that mc_rwlock[node] is taken. is_already_in_mc_list indicates
  * that the mc is already in the list of buffers that are possibly used, and
  * therefore not in the cache. */
-static void reuse_mem_chunk(unsigned node, struct starpu_data_replicate_s *new_replicate, starpu_mem_chunk_t mc, unsigned is_already_in_mc_list)
+static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_replicate, struct _starpu_mem_chunk *mc, unsigned is_already_in_mc_list)
 {
-	starpu_data_handle old_data;
-	old_data = mc->data;
-
 	/* we found an appropriate mem chunk: so we get it out
 	 * of the "to free" list, and reassign it to the new
 	 * piece of data */
 
 	if (!is_already_in_mc_list)
 	{
-		starpu_mem_chunk_list_erase(memchunk_cache[node], mc);
+		_starpu_mem_chunk_list_erase(memchunk_cache[node], mc);
 	}
 
-	struct starpu_data_replicate_s *old_replicate = mc->replicate;
+	struct _starpu_data_replicate *old_replicate = mc->replicate;
 	old_replicate->allocated = 0;
 	old_replicate->automatically_allocated = 0;
 	old_replicate->initialized = 0;
@@ -347,27 +393,27 @@ static void reuse_mem_chunk(unsigned node, struct starpu_data_replicate_s *new_r
 	new_replicate->automatically_allocated = 1;
 	new_replicate->initialized = 0;
 
-	STARPU_ASSERT(new_replicate->chunk_interface);
+	STARPU_ASSERT(new_replicate->data_interface);
 	STARPU_ASSERT(mc->chunk_interface);
-	memcpy(new_replicate->chunk_interface, mc->chunk_interface, old_replicate->ops->interface_size);
+	memcpy(new_replicate->data_interface, mc->chunk_interface, old_replicate->handle->ops->interface_size);
 
 	mc->data = new_replicate->handle;
 	mc->data_was_deleted = 0;
 	/* mc->ops, mc->size, mc->footprint and mc->interface should be
  	 * unchanged ! */
-	
+
 	/* reinsert the mem chunk in the list of active memory chunks */
 	if (!is_already_in_mc_list)
 	{
-		starpu_mem_chunk_list_push_front(mc_list[node], mc);
+		_starpu_mem_chunk_list_push_front(mc_list[node], mc);
 	}
 }
 
-static unsigned try_to_reuse_mem_chunk(starpu_mem_chunk_t mc, unsigned node, starpu_data_handle new_data, unsigned is_already_in_mc_list)
+static unsigned try_to_reuse_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node, struct _starpu_data_replicate *replicate, unsigned is_already_in_mc_list)
 {
 	unsigned success = 0;
 
-	starpu_data_handle old_data;
+	starpu_data_handle_t old_data;
 
 	old_data = mc->data;
 
@@ -381,12 +427,12 @@ static unsigned try_to_reuse_mem_chunk(starpu_mem_chunk_t mc, unsigned node, sta
 	{
 		success = 1;
 
-		/* in case there was nobody using that buffer, throw it 
+		/* in case there was nobody using that buffer, throw it
 		 * away after writing it back to main memory */
 		transfer_subtree_to_node(old_data, node, 0);
 
 		/* now replace the previous data */
-		reuse_mem_chunk(node, new_data, mc, is_already_in_mc_list);
+		reuse_mem_chunk(node, replicate, mc, is_already_in_mc_list);
 	}
 
 	/* unlock the leafs */
@@ -395,38 +441,38 @@ static unsigned try_to_reuse_mem_chunk(starpu_mem_chunk_t mc, unsigned node, sta
 	return success;
 }
 
-static int _starpu_data_interface_compare(void *data_interface_a, struct starpu_data_interface_ops_t *ops_a,
-                                          void *data_interface_b, struct starpu_data_interface_ops_t *ops_b)
+static int _starpu_data_interface_compare(void *data_interface_a, struct starpu_data_interface_ops *ops_a,
+                                          void *data_interface_b, struct starpu_data_interface_ops *ops_b)
 {
 	if (ops_a->interfaceid != ops_b->interfaceid)
 		return -1;
 
-	int ret = ops_a->compare(interface_a, interface_b);
+	int ret = ops_a->compare(data_interface_a, data_interface_b);
 
 	return ret;
 }
 
 /* This function must be called with mc_rwlock[node] taken in write mode */
-static starpu_mem_chunk_t _starpu_memchunk_cache_lookup_locked(uint32_t node, starpu_data_handle handle)
+static struct _starpu_mem_chunk *_starpu_memchunk_cache_lookup_locked(uint32_t node, starpu_data_handle_t handle)
 {
 	uint32_t footprint = _starpu_compute_data_footprint(handle);
 
 	/* go through all buffers in the cache */
-	starpu_mem_chunk_t mc;
-	for (mc = starpu_mem_chunk_list_begin(memchunk_cache[node]);
-	     mc != starpu_mem_chunk_list_end(memchunk_cache[node]);
-	     mc = starpu_mem_chunk_list_next(mc))
+	struct _starpu_mem_chunk *mc;
+	for (mc = _starpu_mem_chunk_list_begin(memchunk_cache[node]);
+	     mc != _starpu_mem_chunk_list_end(memchunk_cache[node]);
+	     mc = _starpu_mem_chunk_list_next(mc))
 	{
 		if (mc->footprint == footprint)
 		{
 			/* Is that a false hit ? (this is _very_ unlikely) */
-			if (_starpu_data_interface_compare(handle->per_node[node].interface, handle->ops, mc->interface, mc->ops))
+			if (_starpu_data_interface_compare(handle->per_node[node].data_interface, handle->ops, mc->chunk_interface, mc->ops))
 				continue;
 
 			/* Cache hit */
 
 			/* Remove from the cache */
-			starpu_mem_chunk_list_erase(memchunk_cache[node], mc);
+			_starpu_mem_chunk_list_erase(memchunk_cache[node], mc);
 			return mc;
 		}
 	}
@@ -438,33 +484,33 @@ static starpu_mem_chunk_t _starpu_memchunk_cache_lookup_locked(uint32_t node, st
 /* this function looks for a memory chunk that matches a given footprint in the
  * list of mem chunk that need to be freed. This function must be called with
  * mc_rwlock[node] taken in write mode. */
-static unsigned try_to_find_reusable_mem_chunk(unsigned node, starpu_data_handle data, uint32_t footprint)
+static unsigned try_to_find_reusable_mem_chunk(unsigned node, starpu_data_handle_t data, struct _starpu_data_replicate *replicate, uint32_t footprint)
 {
-	starpu_mem_chunk_t mc, next_mc;
+	struct _starpu_mem_chunk *mc, *next_mc;
 
 	/* go through all buffers in the cache */
-	mc = _starpu_memchunk_cache_lookup_locked(node, handle);
+	mc = _starpu_memchunk_cache_lookup_locked(node, data);
 	if (mc)
 	{
 		/* We found an entry in the cache so we can reuse it */
-		reuse_mem_chunk(node, data, mc, 0);
+		reuse_mem_chunk(node, replicate, mc, 0);
 		return 1;
 	}
 
 	/* now look for some non essential data in the active list */
-	for (mc = starpu_mem_chunk_list_begin(mc_list[node]);
-	     mc != starpu_mem_chunk_list_end(mc_list[node]);
+	for (mc = _starpu_mem_chunk_list_begin(mc_list[node]);
+	     mc != _starpu_mem_chunk_list_end(mc_list[node]);
 	     mc = next_mc)
 	{
 		/* there is a risk that the memory chunk is freed before next
 		 * iteration starts: so we compute the next element of the list
 		 * now */
-		next_mc = starpu_mem_chunk_list_next(mc);
+		next_mc = _starpu_mem_chunk_list_next(mc);
 
 		if (mc->data->is_not_important && (mc->footprint == footprint))
 		{
 //			fprintf(stderr, "found a candidate ...\n");
-			if (try_to_reuse_mem_chunk(mc, node, data, 1))
+			if (try_to_reuse_mem_chunk(mc, node, replicate, 1))
 				return 1;
 		}
 	}
@@ -477,24 +523,26 @@ static unsigned try_to_find_reusable_mem_chunk(unsigned node, starpu_data_handle
  * Free the memory chuncks that are explicitely tagged to be freed. The
  * mc_rwlock[node] rw-lock should be taken prior to calling this function.
  */
-static size_t flush_memchunk_cache(uint32_t node)
+static size_t flush_memchunk_cache(uint32_t node, size_t reclaim)
 {
-	starpu_mem_chunk_t mc, next_mc;
-	
+	struct _starpu_mem_chunk *mc, *next_mc;
+
 	size_t freed = 0;
 
-	for (mc = starpu_mem_chunk_list_begin(memchunk_cache[node]);
-	     mc != starpu_mem_chunk_list_end(memchunk_cache[node]);
+	for (mc = _starpu_mem_chunk_list_begin(memchunk_cache[node]);
+	     mc != _starpu_mem_chunk_list_end(memchunk_cache[node]);
 	     mc = next_mc)
 	{
-		next_mc = starpu_mem_chunk_list_next(mc);
+		next_mc = _starpu_mem_chunk_list_next(mc);
 
 		freed += free_memory_on_node(mc, node);
 
-		starpu_mem_chunk_list_erase(memchunk_cache[node], mc);
+		_starpu_mem_chunk_list_erase(memchunk_cache[node], mc);
 
 		free(mc->chunk_interface);
-		starpu_mem_chunk_delete(mc);
+		_starpu_mem_chunk_delete(mc);
+		if (reclaim && freed>reclaim)
+			break;
 	}
 
 	return freed;
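
With the new reclaim argument the cache flush becomes a bounded operation: entries are freed front to back and the loop stops as soon as more than the requested amount has been released, a zero target keeping the old flush-everything behaviour. A self-contained sketch of that loop shape, with an invented cached_chunk type standing in for the real memchunk and plain free() standing in for free_memory_on_node():

#include <stddef.h>
#include <stdlib.h>

/* Illustrative chunk: only the size matters for this sketch. */
struct cached_chunk
{
	size_t size;
	struct cached_chunk *next;
};

/* Free cached chunks until the cache is empty or more than `reclaim`
 * bytes have been released; reclaim == 0 means "flush everything". */
static size_t flush_cache(struct cached_chunk **cache, size_t reclaim)
{
	size_t freed = 0;
	struct cached_chunk *c = *cache;

	while (c)
	{
		struct cached_chunk *next = c->next;   /* c is freed below */
		freed += c->size;
		free(c);
		c = next;
		if (reclaim && freed > reclaim)
			break;
	}
	*cache = c;     /* whatever was not freed stays in the cache */
	return freed;
}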
@@ -506,30 +554,31 @@ static size_t flush_memchunk_cache(uint32_t node)
  * should only be used at the termination of StarPU for instance). The
  * mc_rwlock[node] rw-lock should be taken prior to calling this function.
  */
-static size_t free_potentially_in_use_mc(uint32_t node, unsigned force)
+static size_t free_potentially_in_use_mc(uint32_t node, unsigned force, size_t reclaim)
 {
 	size_t freed = 0;
 
-	starpu_mem_chunk_t mc, next_mc;
+	struct _starpu_mem_chunk *mc, *next_mc;
 
-	for (mc = starpu_mem_chunk_list_begin(mc_list[node]);
-	     mc != starpu_mem_chunk_list_end(mc_list[node]);
+	for (mc = _starpu_mem_chunk_list_begin(mc_list[node]);
+	     mc != _starpu_mem_chunk_list_end(mc_list[node]);
 	     mc = next_mc)
 	{
-		/* there is a risk that the memory chunk is freed 
+		/* there is a risk that the memory chunk is freed
 		   before next iteration starts: so we compute the next
 		   element of the list now */
-		next_mc = starpu_mem_chunk_list_next(mc);
+		next_mc = _starpu_mem_chunk_list_next(mc);
 
 		if (!force)
 		{
 			freed += try_to_free_mem_chunk(mc, node);
-			#if 0
-			if (freed > toreclaim)
+			#if 1
+			if (reclaim && freed > reclaim)
 				break;
 			#endif
 		}
-		else {
+		else
+		{
 			/* We must free the memory now: note that data
 			 * coherency is not maintained in that case ! */
 			freed += do_free_mem_chunk(mc, node);
@@ -539,19 +588,22 @@ static size_t free_potentially_in_use_mc(uint32_t node, unsigned force)
 	return freed;
 }
 
-static size_t reclaim_memory_generic(uint32_t node, unsigned force)
+static size_t reclaim_memory_generic(uint32_t node, unsigned force, size_t reclaim)
 {
 	size_t freed = 0;
 
-	PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
+
+	starpu_lru(node);
 
 	/* remove all buffers for which there was a removal request */
-	freed += flush_memchunk_cache(node);
+	freed += flush_memchunk_cache(node, reclaim);
 
 	/* try to free all allocated data potentially in use */
-	freed += free_potentially_in_use_mc(node, force);
+	if (reclaim && freed<reclaim)
+		freed += free_potentially_in_use_mc(node, force, reclaim);
 
-	PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
 
 	return freed;
 
@@ -564,13 +616,13 @@ static size_t reclaim_memory_generic(uint32_t node, unsigned force)
  */
 size_t _starpu_free_all_automatically_allocated_buffers(uint32_t node)
 {
-	return reclaim_memory_generic(node, 1);
+	return reclaim_memory_generic(node, 1, 0);
 }
 
-static starpu_mem_chunk_t _starpu_memchunk_init(struct starpu_data_replicate_s *replicate, size_t size, size_t interface_size, unsigned automatically_allocated)
+static struct _starpu_mem_chunk *_starpu_memchunk_init(struct _starpu_data_replicate *replicate, size_t size, size_t interface_size, unsigned automatically_allocated)
 {
-	starpu_mem_chunk_t mc = starpu_mem_chunk_new();
-	starpu_data_handle handle = replicate->handle;
+	struct _starpu_mem_chunk *mc = _starpu_mem_chunk_new();
+	starpu_data_handle_t handle = replicate->handle;
 
 	STARPU_ASSERT(handle);
 	STARPU_ASSERT(handle->ops);
@@ -581,8 +633,9 @@ static starpu_mem_chunk_t _starpu_memchunk_init(struct starpu_data_replicate_s *
 	mc->ops = handle->ops;
 	mc->data_was_deleted = 0;
 	mc->automatically_allocated = automatically_allocated;
-	mc->relaxed_coherency = replicate->relaxed_coherency;		
+	mc->relaxed_coherency = replicate->relaxed_coherency;
 	mc->replicate = replicate;
+	mc->replicate->mc = mc;
 
 	/* Save a copy of the interface */
 	mc->chunk_interface = malloc(interface_size);
@@ -592,49 +645,50 @@ static starpu_mem_chunk_t _starpu_memchunk_init(struct starpu_data_replicate_s *
 	return mc;
 }
 
-static void register_mem_chunk(struct starpu_data_replicate_s *replicate, size_t size, unsigned automatically_allocated)
+static void register_mem_chunk(struct _starpu_data_replicate *replicate, size_t size, unsigned automatically_allocated)
 {
 	unsigned dst_node = replicate->memory_node;
 
-	starpu_mem_chunk_t mc;
+	struct _starpu_mem_chunk *mc;
 
 	/* the interface was already filled by ops->allocate_data_on_node */
 	size_t interface_size = replicate->handle->ops->interface_size;
 
 	/* Put this memchunk in the list of memchunk in use */
-	mc = _starpu_memchunk_init(replicate, size, interface_size, automatically_allocated); 
+	mc = _starpu_memchunk_init(replicate, size, interface_size, automatically_allocated);
 
-	PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[dst_node]);
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[dst_node]);
 
-	starpu_mem_chunk_list_push_front(mc_list[dst_node], mc);
+	_starpu_mem_chunk_list_push_back(mc_list[dst_node], mc);
 
-	PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[dst_node]);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[dst_node]);
 }
 
 /* This function is called when the handle is destroyed (eg. when calling
  * unregister or unpartition). It puts all the memchunks that refer to the
  * specified handle into the cache. */
-void _starpu_request_mem_chunk_removal(starpu_data_handle handle, unsigned node)
+void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, unsigned node)
 {
-	PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
 
 	/* iterate over the list of memory chunks and remove the entry */
-	starpu_mem_chunk_t mc, next_mc;
-	for (mc = starpu_mem_chunk_list_begin(mc_list[node]);
-	     mc != starpu_mem_chunk_list_end(mc_list[node]);
+	struct _starpu_mem_chunk *mc, *next_mc;
+	for (mc = _starpu_mem_chunk_list_begin(mc_list[node]);
+	     mc != _starpu_mem_chunk_list_end(mc_list[node]);
 	     mc = next_mc)
 	{
-		next_mc = starpu_mem_chunk_list_next(mc);
+		next_mc = _starpu_mem_chunk_list_next(mc);
 
-		if (mc->data == handle) {
+		if (mc->data == handle)
+		{
 			/* we found the data */
 			mc->data_was_deleted = 1;
 
 			/* remove it from the main list */
-			starpu_mem_chunk_list_erase(mc_list[node], mc);
+			_starpu_mem_chunk_list_erase(mc_list[node], mc);
 
 			/* put it in the list of buffers to be removed */
-			starpu_mem_chunk_list_push_front(memchunk_cache[node], mc);
+			_starpu_mem_chunk_list_push_front(memchunk_cache[node], mc);
 
 			/* Note that we do not stop here because there can be
 			 * multiple replicates associated to the same handle on
@@ -643,7 +697,42 @@ void _starpu_request_mem_chunk_removal(starpu_data_handle handle, unsigned node)
 	}
 
 	/* there was no corresponding buffer ... */
-	PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+}
+
+static size_t _starpu_get_global_mem_size(int dst_node)
+{
+	enum starpu_node_kind kind = starpu_node_get_kind(dst_node);
+	size_t global_mem_size;
+
+	switch(kind)
+	{
+		case STARPU_CPU_RAM:
+#ifdef STARPU_DEVEL
+#warning to be fixed
+#endif
+			global_mem_size = 64*1024*1024;
+			break;
+#ifdef STARPU_USE_CUDA
+		case STARPU_CUDA_RAM:
+		{
+			int devid = _starpu_memory_node_to_devid(dst_node);
+			global_mem_size = starpu_cuda_get_global_mem_size(devid);
+			break;
+		}
+#endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_RAM:
+		{
+			int devid = _starpu_memory_node_to_devid(dst_node);
+			global_mem_size = starpu_opencl_get_global_mem_size(devid);
+			break;
+		}
+#endif
+		default:
+			STARPU_ASSERT(0);
+	}
+	return global_mem_size;
 }
 
 /*
@@ -658,7 +747,7 @@ void _starpu_request_mem_chunk_removal(starpu_data_handle handle, unsigned node)
  *
  */
 
-static ssize_t _starpu_allocate_interface(starpu_data_handle handle, struct starpu_data_replicate_s *replicate, uint32_t dst_node)
+static ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, uint32_t dst_node, unsigned is_prefetch)
 {
 	unsigned attempts = 0;
 	ssize_t allocated_memory;
@@ -669,64 +758,81 @@ static ssize_t _starpu_allocate_interface(starpu_data_handle handle, struct star
 	/* perhaps we can directly reuse a buffer in the free-list */
 	uint32_t footprint = _starpu_compute_data_footprint(handle);
 
-	STARPU_TRACE_START_ALLOC_REUSE(dst_node);
-	PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
+	_STARPU_TRACE_START_ALLOC_REUSE(dst_node);
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[dst_node]);
 
-	if (try_to_find_reusable_mem_chunk(dst_node, handle, footprint))
+	if (try_to_find_reusable_mem_chunk(dst_node, handle, replicate, footprint))
 	{
-		PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+		_STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[dst_node]);
 		_starpu_allocation_cache_hit(dst_node);
 		ssize_t data_size = _starpu_data_get_size(handle);
 		return data_size;
 	}
 
-	PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
-	STARPU_TRACE_END_ALLOC_REUSE(dst_node);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[dst_node]);
+	_STARPU_TRACE_END_ALLOC_REUSE(dst_node);
 #endif
 
-	do {
+	do
+	{
 		STARPU_ASSERT(handle->ops);
 		STARPU_ASSERT(handle->ops->allocate_data_on_node);
 
-		STARPU_TRACE_START_ALLOC(dst_node);
+		_STARPU_TRACE_START_ALLOC(dst_node);
 		STARPU_ASSERT(replicate->data_interface);
 
 #if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER)
-		if (_starpu_get_node_kind(dst_node) == STARPU_CUDA_RAM)
+		if (starpu_node_get_kind(dst_node) == STARPU_CUDA_RAM)
 		{
 			/* To facilitate the design of interface, we set the
 			 * proper CUDA device in case it is needed. This avoids
 			 * having to set it again in the malloc method of each
 			 * interface. */
-			cudaError_t err = cudaSetDevice(starpu_memory_node_to_devid(dst_node));
+			cudaError_t err = cudaSetDevice(_starpu_memory_node_to_devid(dst_node));
 			STARPU_ASSERT(err == cudaSuccess);
 		}
 #endif
 
 		allocated_memory = handle->ops->allocate_data_on_node(replicate->data_interface, dst_node);
-		STARPU_TRACE_END_ALLOC(dst_node);
+		_STARPU_TRACE_END_ALLOC(dst_node);
 
 		if (allocated_memory == -ENOMEM)
 		{
+			size_t reclaim = 0.25*_starpu_get_global_mem_size(dst_node);
+			if (starpu_memstrategy_data_size_coefficient*handle->data_size > reclaim)
+				reclaim = starpu_memstrategy_data_size_coefficient*handle->data_size;
+
+			/* Take temporary reference on the replicate */
 			replicate->refcnt++;
+			handle->busy_count++;
 			_starpu_spin_unlock(&handle->header_lock);
 
-			STARPU_TRACE_START_MEMRECLAIM(dst_node);
-			reclaim_memory_generic(dst_node, 0);
-			STARPU_TRACE_END_MEMRECLAIM(dst_node);
+			_STARPU_TRACE_START_MEMRECLAIM(dst_node);
+			if (is_prefetch) {
+				_STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[dst_node]);
+				flush_memchunk_cache(dst_node, reclaim);
+				_STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[dst_node]);
+			} else
+				reclaim_memory_generic(dst_node, 0, reclaim);
+			_STARPU_TRACE_END_MEMRECLAIM(dst_node);
 
 		        while (_starpu_spin_trylock(&handle->header_lock))
 		                _starpu_datawizard_progress(_starpu_get_local_memory_node(), 0);
-		
+
 			replicate->refcnt--;
+			STARPU_ASSERT(replicate->refcnt >= 0);
+			STARPU_ASSERT(handle->busy_count > 0);
+			handle->busy_count--;
+			_starpu_data_check_not_busy(handle);
 		}
-		
-	} while((allocated_memory == -ENOMEM) && attempts++ < 2);
+
+	}
+	while((allocated_memory == -ENOMEM) && attempts++ < 2);
 
 	return allocated_memory;
 }
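
On -ENOMEM the loop above computes a reclaim target, namely a quarter of the device memory or starpu_memstrategy_data_size_coefficient times the data size if that is larger, releases its locks, reclaims (only flushing the clean cache when the request is a prefetch), and retries at most twice. A condensed, self-contained sketch of that retry shape; every name below is invented, and the locking, tracing and refcount juggling are left out:

#include <stddef.h>

/* Retry-with-reclaim skeleton: on allocation failure, ask the reclaim
 * machinery for at least max(global_mem / 4, k * data_size) bytes and
 * try again a bounded number of times.  alloc() and reclaim() stand in
 * for the per-interface allocation method and the node-level reclaim. */
static void *alloc_with_reclaim(size_t data_size, size_t global_mem, double k,
				void *(*alloc)(size_t),
				size_t (*reclaim)(size_t))
{
	unsigned attempts = 0;
	void *ptr;

	do
	{
		ptr = alloc(data_size);
		if (ptr)
			break;

		size_t target = global_mem / 4;            /* 25% of the device memory */
		if ((size_t)(k * data_size) > target)
			target = (size_t)(k * data_size);
		reclaim(target);
	}
	while (attempts++ < 2);

	return ptr;
}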
 
-int _starpu_allocate_memory_on_node(starpu_data_handle handle, struct starpu_data_replicate_s *replicate)
+int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned is_prefetch)
 {
 	ssize_t allocated_memory;
 
@@ -739,7 +845,7 @@ int _starpu_allocate_memory_on_node(starpu_data_handle handle, struct starpu_dat
 		return 0;
 
 	STARPU_ASSERT(replicate->data_interface);
-	allocated_memory = _starpu_allocate_interface(handle, replicate, dst_node);
+	allocated_memory = _starpu_allocate_interface(handle, replicate, dst_node, is_prefetch);
 
 	/* perhaps we could really not handle that capacity misses */
 	if (allocated_memory == -ENOMEM)
@@ -762,7 +868,74 @@ int _starpu_allocate_memory_on_node(starpu_data_handle handle, struct starpu_dat
 	return 0;
 }
 
-unsigned starpu_data_test_if_allocated_on_node(starpu_data_handle handle, uint32_t memory_node)
+unsigned starpu_data_test_if_allocated_on_node(starpu_data_handle_t handle, uint32_t memory_node)
 {
 	return handle->per_node[memory_node].allocated;
 }
+
+void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node)
+{
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&lru_rwlock[node]);
+	struct _starpu_mem_chunk_lru *mc_lru=_starpu_mem_chunk_lru_new();
+	mc_lru->mc=mc;
+	_starpu_mem_chunk_lru_list_push_front(starpu_lru_list[node],mc_lru);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&lru_rwlock[node]);
+}
+
+/* The mc_rwlock[node] rw-lock should be taken prior to calling this function.*/
+static void _starpu_memchunk_recently_used_move(struct _starpu_mem_chunk *mc, unsigned node)
+{
+	/* XXX Sometimes the memchunk is not in the list... */
+	struct _starpu_mem_chunk *mc_iter;
+	for (mc_iter = _starpu_mem_chunk_list_begin(mc_list[node]);
+	     mc_iter != _starpu_mem_chunk_list_end(mc_list[node]);
+	     mc_iter = _starpu_mem_chunk_list_next(mc_iter) )
+	{
+		if (mc_iter==mc)
+		{
+			_starpu_mem_chunk_list_erase(mc_list[node], mc);
+			_starpu_mem_chunk_list_push_back(mc_list[node], mc);
+			return;
+		}
+
+	}
+}
+
+static void starpu_lru(unsigned node)
+{
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&lru_rwlock[node]);
+	while (!_starpu_mem_chunk_lru_list_empty(starpu_lru_list[node]))
+	{
+		struct _starpu_mem_chunk_lru *mc_lru=_starpu_mem_chunk_lru_list_front(starpu_lru_list[node]);
+		_starpu_memchunk_recently_used_move(mc_lru->mc, node);
+		_starpu_mem_chunk_lru_list_erase(starpu_lru_list[node], mc_lru);
+		_starpu_mem_chunk_lru_delete(mc_lru);
+	}
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&lru_rwlock[node]);
+}
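
The LRU bookkeeping is split so that the hot path stays cheap: _starpu_memchunk_recently_used() only pushes a small note onto a per-node side list under lru_rwlock, and starpu_lru(), called at the start of reclaim while mc_rwlock is held, drains that list and moves the touched chunks to the back of mc_list so eviction keeps scanning the colder entries first. A self-contained sketch of the two-list scheme with invented types, no locking and no error handling:

#include <stdlib.h>

/* Doubly-linked chunk in the main per-node list; `head` is a sentinel. */
struct chunk { struct chunk *prev, *next; };

/* Cheap singly-linked note recorded on the hot path. */
struct lru_note { struct chunk *mc; struct lru_note *next; };

struct node_state
{
	struct chunk head;       /* main list, scanned front to back when evicting */
	struct lru_note *feed;   /* side list filled by recently_used() */
};

static void node_init(struct node_state *s)
{
	s->head.prev = s->head.next = &s->head;
	s->feed = NULL;
}

static void list_unlink(struct chunk *c)
{
	c->prev->next = c->next;
	c->next->prev = c->prev;
}

static void list_push_back(struct chunk *head, struct chunk *c)
{
	c->prev = head->prev;
	c->next = head;
	head->prev->next = c;
	head->prev = c;
}

/* Hot path: only record that the chunk was touched. */
static void recently_used(struct node_state *s, struct chunk *mc)
{
	struct lru_note *n = malloc(sizeof(*n));
	n->mc = mc;
	n->next = s->feed;
	s->feed = n;
}

/* Reclaim path: replay the notes, moving touched chunks towards the back
 * of the main list (assumes the chunk is still linked in that list). */
static void drain_lru(struct node_state *s)
{
	while (s->feed)
	{
		struct lru_note *n = s->feed;
		s->feed = n->next;
		list_unlink(n->mc);
		list_push_back(&s->head, n->mc);
		free(n);
	}
}

The real _starpu_memchunk_recently_used_move() additionally verifies that the chunk is still present in the main list before moving it, as noted by the XXX comment above.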
+
+
+#ifdef STARPU_MEMORY_STATUS
+void _starpu_display_data_stats_by_node(int node)
+{
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
+
+	if (!_starpu_mem_chunk_list_empty(mc_list[node]))
+	{
+		fprintf(stderr, "#-------\n");
+		fprintf(stderr, "Data on Node #%d\n",node);
+
+		struct _starpu_mem_chunk *mc;
+
+		for (mc = _starpu_mem_chunk_list_begin(mc_list[node]);
+		     mc != _starpu_mem_chunk_list_end(mc_list[node]);
+		     mc = _starpu_mem_chunk_list_next(mc))
+		{
+			_starpu_display_data_handle_stats(mc->data);
+		}
+
+	}
+
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+}
+#endif

+ 16 - 8
src/datawizard/memalloc.h

@@ -26,21 +26,21 @@
 #include <datawizard/coherency.h>
 #include <datawizard/copy_driver.h>
 
-struct starpu_data_replicate_s;
+struct _starpu_data_replicate;
 
-LIST_TYPE(starpu_mem_chunk,
-	starpu_data_handle data;
+LIST_TYPE(_starpu_mem_chunk,
+	starpu_data_handle_t data;
 	size_t size;
 
 	uint32_t footprint;
-	
+
 	/* The footprint of the data is not sufficient to determine whether two
 	 * pieces of data have the same layout (there could be collision in the
 	 * hash function ...) so we still keep a copy of the actual layout (ie.
 	 * the data interface) to stay on the safe side. We make a copy of
 	 * because when a data is deleted, the memory chunk remains.
 	 */
-	struct starpu_data_interface_ops_t *ops;
+	struct starpu_data_interface_ops *ops;
 	void *chunk_interface;
 	unsigned automatically_allocated;
 	unsigned data_was_deleted;
@@ -48,12 +48,20 @@ LIST_TYPE(starpu_mem_chunk,
 	/* A buffer that is used for SCRATCH or reduction cannot be used with
 	 * filters. */
 	unsigned relaxed_coherency;
-	struct starpu_data_replicate_s *replicate;
+	struct _starpu_data_replicate *replicate;
+)
+
+/* LRU list */
+LIST_TYPE(_starpu_mem_chunk_lru,
+	struct _starpu_mem_chunk *mc;
 )
 
 void _starpu_init_mem_chunk_lists(void);
 void _starpu_deinit_mem_chunk_lists(void);
-void _starpu_request_mem_chunk_removal(starpu_data_handle handle, unsigned node);
-int _starpu_allocate_memory_on_node(starpu_data_handle handle, struct starpu_data_replicate_s *replicate);
+void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, unsigned node);
+int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned is_prefetch);
 size_t _starpu_free_all_automatically_allocated_buffers(uint32_t node);
+void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node);
+
+void _starpu_display_data_stats_by_node(int node);
 #endif

+ 35 - 22
src/datawizard/memory_nodes.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,20 +23,23 @@
 #include "copy_driver.h"
 #include "memalloc.h"
 
-static starpu_mem_node_descr descr;
+static struct _starpu_mem_node_descr descr;
 static pthread_key_t memory_node_key;
 
 void _starpu_init_memory_nodes(void)
 {
-	/* there is no node yet, subsequent nodes will be 
+	/* there is no node yet, subsequent nodes will be
 	 * added using _starpu_register_memory_node */
 	descr.nnodes = 0;
 
 	pthread_key_create(&memory_node_key, NULL);
 
 	unsigned i;
-	for (i = 0; i < STARPU_MAXNODES; i++) 
-		descr.nodes[i] = STARPU_UNUSED; 
+	for (i = 0; i < STARPU_MAXNODES; i++)
+	{
+		descr.nodes[i] = STARPU_UNUSED;
+		descr.nworkers[i] = 0;
+	}
 
 	_starpu_init_mem_chunk_lists();
 	_starpu_init_data_request_lists();
@@ -62,8 +65,8 @@ unsigned _starpu_get_local_memory_node(void)
 {
 	unsigned *memory_node;
 	memory_node = (unsigned *) pthread_getspecific(memory_node_key);
-	
-	/* in case this is called by the programmer, we assume the RAM node 
+
+	/* in case this is called by the programmer, we assume the RAM node
 	   is the appropriate memory node ... so we return 0 XXX */
 	if (STARPU_UNLIKELY(!memory_node))
 		return 0;
@@ -71,34 +74,44 @@ unsigned _starpu_get_local_memory_node(void)
 	return *memory_node;
 }
 
-starpu_mem_node_descr *_starpu_get_memory_node_description(void)
+void _starpu_memory_node_worker_add(unsigned node)
+{
+	descr.nworkers[node]++;
+}
+
+unsigned _starpu_memory_node_workers(unsigned node)
+{
+	return descr.nworkers[node];
+}
+
+struct _starpu_mem_node_descr *_starpu_get_memory_node_description(void)
 {
 	return &descr;
 }
 
-starpu_node_kind _starpu_get_node_kind(uint32_t node)
+enum starpu_node_kind starpu_node_get_kind(uint32_t node)
 {
 	return descr.nodes[node];
 }
 
-int starpu_memory_node_to_devid(unsigned node)
+int _starpu_memory_node_to_devid(unsigned node)
 {
 	return descr.devid[node];
 }
 
-unsigned _starpu_get_memory_nodes_count(void)
+unsigned starpu_memory_nodes_get_count(void)
 {
 	return descr.nnodes;
 }
 
-unsigned _starpu_register_memory_node(starpu_node_kind kind, int devid)
+unsigned _starpu_register_memory_node(enum starpu_node_kind kind, int devid)
 {
 	unsigned nnodes;
 	/* ATOMIC_ADD returns the new value ... */
 	nnodes = STARPU_ATOMIC_ADD(&descr.nnodes, 1);
 
 	descr.nodes[nnodes-1] = kind;
-	STARPU_TRACE_NEW_MEM_NODE(nnodes-1);
+	_STARPU_TRACE_NEW_MEM_NODE(nnodes-1);
 
 	descr.devid[nnodes-1] = devid;
 
@@ -115,8 +128,8 @@ void _starpu_memory_node_register_condition(pthread_cond_t *cond, pthread_mutex_
 {
 	unsigned cond_id;
 	unsigned nconds_total, nconds;
-	
-	pthread_rwlock_wrlock(&descr.conditions_rwlock);
+
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&descr.conditions_rwlock);
 
 	/* we only insert the queue if it's not already in the list */
 	nconds = descr.condition_count[nodeid];
@@ -127,7 +140,7 @@ void _starpu_memory_node_register_condition(pthread_cond_t *cond, pthread_mutex_
 			STARPU_ASSERT(descr.conditions_attached_to_node[nodeid][cond_id].mutex == mutex);
 
 			/* the condition is already in the list */
-			pthread_rwlock_unlock(&descr.conditions_rwlock);
+			_STARPU_PTHREAD_RWLOCK_UNLOCK(&descr.conditions_rwlock);
 			return;
 		}
 	}
@@ -138,28 +151,28 @@ void _starpu_memory_node_register_condition(pthread_cond_t *cond, pthread_mutex_
 	descr.condition_count[nodeid]++;
 
 	/* do we have to add it in the global list as well ? */
-	nconds_total = descr.total_condition_count; 
+	nconds_total = descr.total_condition_count;
 	for (cond_id = 0; cond_id < nconds_total; cond_id++)
 	{
 		if (descr.conditions_all[cond_id].cond == cond)
 		{
 			/* the queue is already in the global list */
-			pthread_rwlock_unlock(&descr.conditions_rwlock);
+			_STARPU_PTHREAD_RWLOCK_UNLOCK(&descr.conditions_rwlock);
 			return;
 		}
-	} 
+	}
 
 	/* it was not in the global list either */
 	descr.conditions_all[nconds_total].cond = cond;
 	descr.conditions_all[nconds_total].mutex = mutex;
 	descr.total_condition_count++;
 
-	pthread_rwlock_unlock(&descr.conditions_rwlock);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&descr.conditions_rwlock);
 }
 
 unsigned starpu_worker_get_memory_node(unsigned workerid)
 {
-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
 
 	/* This workerid may either be a basic worker or a combined worker */
 	unsigned nworkers = config->topology.nworkers;

+ 20 - 24
src/datawizard/memory_nodes.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,59 +23,55 @@
 #include <datawizard/coherency.h>
 #include <datawizard/memalloc.h>
 
-typedef enum {
-	STARPU_UNUSED     = 0x00,
-	STARPU_CPU_RAM    = 0x01,
-	STARPU_CUDA_RAM   = 0x02,
-	STARPU_OPENCL_RAM = 0x03,
-	STARPU_SPU_LS     = 0x04
-} starpu_node_kind;
-
-typedef starpu_node_kind starpu_memory_node_tuple;
 
 #define _STARPU_MEMORY_NODE_TUPLE(node1,node2) (node1 | (node2 << 4))
 #define _STARPU_MEMORY_NODE_TUPLE_FIRST(tuple) (tuple & 0x0F)
 #define _STARPU_MEMORY_NODE_TUPLE_SECOND(tuple) (tuple & 0xF0)
 
-struct _cond_and_mutex {
+struct _starpu_cond_and_mutex
+{
         pthread_cond_t *cond;
-        pthread_mutex_t *mutex;	
+        pthread_mutex_t *mutex;
 };
 
-typedef struct {
+struct _starpu_mem_node_descr
+{
 	unsigned nnodes;
-	starpu_node_kind nodes[STARPU_MAXNODES];
+	enum starpu_node_kind nodes[STARPU_MAXNODES];
 
 	/* Get the device id associated to this node, or -1 if not applicable */
 	int devid[STARPU_MAXNODES];
 
-	// TODO move this 2 lists outside starpu_mem_node_descr
+	unsigned nworkers[STARPU_MAXNODES];
+
+	// TODO move these 2 lists outside struct _starpu_mem_node_descr
 	/* Every worker is associated to a condition variable on which the
 	 * worker waits when there is a task available. It is possible that
 	 * multiple workers share the same condition variable, so we maintain a
 	 * list of all these condition variables so that we can wake up all
 	 * workers attached to a memory node that are waiting on a task. */
 	pthread_rwlock_t conditions_rwlock;
-	struct _cond_and_mutex conditions_attached_to_node[STARPU_MAXNODES][STARPU_NMAXWORKERS];
-	struct _cond_and_mutex conditions_all[STARPU_MAXNODES*STARPU_NMAXWORKERS];
+	struct _starpu_cond_and_mutex conditions_attached_to_node[STARPU_MAXNODES][STARPU_NMAXWORKERS];
+	struct _starpu_cond_and_mutex conditions_all[STARPU_MAXNODES*STARPU_NMAXWORKERS];
 	/* the number of queues attached to each node */
 	unsigned total_condition_count;
 	unsigned condition_count[STARPU_MAXNODES];
 
-} starpu_mem_node_descr;
+};
 
 void _starpu_init_memory_nodes(void);
 void _starpu_deinit_memory_nodes(void);
 void _starpu_set_local_memory_node_key(unsigned *node);
 unsigned _starpu_get_local_memory_node(void);
-unsigned _starpu_register_memory_node(starpu_node_kind kind, int devid);
+void _starpu_memory_node_worker_add(unsigned node);
+unsigned _starpu_memory_node_workers(unsigned node);
+unsigned _starpu_register_memory_node(enum starpu_node_kind kind, int devid);
 //void _starpu_memory_node_attach_queue(struct starpu_jobq_s *q, unsigned nodeid);
 void _starpu_memory_node_register_condition(pthread_cond_t *cond, pthread_mutex_t *mutex, unsigned memory_node);
 
-starpu_node_kind _starpu_get_node_kind(uint32_t node);
-int starpu_memory_node_to_devid(unsigned node);
-unsigned _starpu_get_memory_nodes_count(void);
+enum starpu_node_kind _starpu_node_get_kind(uint32_t node);
+int _starpu_memory_node_to_devid(unsigned node);
 
-starpu_mem_node_descr *_starpu_get_memory_node_description(void);
+struct _starpu_mem_node_descr *_starpu_get_memory_node_description(void);
 
 #endif // __MEMORY_NODES_H__

+ 3 - 2
src/datawizard/progress.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -25,6 +25,7 @@ void _starpu_datawizard_progress(uint32_t memory_node, unsigned may_alloc)
 	/* in case some other driver requested data */
 	_starpu_handle_pending_node_data_requests(memory_node);
 	_starpu_handle_node_data_requests(memory_node, may_alloc);
-
+	_starpu_handle_node_prefetch_requests(memory_node, may_alloc);
 	_starpu_execute_registered_progression_hooks();
 }
+
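
The progress step now also drains prefetch requests, and the order matters: transfers already in flight are completed first, ordinary (blocking) data requests are served next, and prefetch requests, which are best-effort, come last. A small sketch of that ordering; the progress_ops structure and its callbacks are invented stand-ins for the StarPU request-handling functions:

/* Order of operations in one driver progress step. */
struct progress_ops
{
	void (*handle_pending)(unsigned node);
	void (*handle_requests)(unsigned node, unsigned may_alloc);
	void (*handle_prefetch)(unsigned node, unsigned may_alloc);
	void (*run_hooks)(void);
};

static void progress_step(const struct progress_ops *ops,
			  unsigned node, unsigned may_alloc)
{
	ops->handle_pending(node);               /* transfers already started */
	ops->handle_requests(node, may_alloc);   /* blocking requests first */
	ops->handle_prefetch(node, may_alloc);   /* prefetches last, best-effort */
	ops->run_hooks();
}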

+ 155 - 82
src/datawizard/reduction.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -17,20 +17,24 @@
 
 #include <starpu.h>
 #include <common/utils.h>
+#include <util/starpu_data_cpy.h>
 #include <core/task.h>
 #include <datawizard/datawizard.h>
 
-void starpu_data_set_reduction_methods(starpu_data_handle handle,
-					struct starpu_codelet_t *redux_cl,
-					struct starpu_codelet_t *init_cl)
+void starpu_data_set_reduction_methods(starpu_data_handle_t handle,
+				       struct starpu_codelet *redux_cl,
+				       struct starpu_codelet *init_cl)
 {
 	_starpu_spin_lock(&handle->header_lock);
 
+	_starpu_codelet_check_deprecated_fields(redux_cl);
+	_starpu_codelet_check_deprecated_fields(init_cl);
+
 	unsigned child;
 	for (child = 0; child < handle->nchildren; child++)
 	{
 		/* make sure that the flags are applied to the children as well */
-		struct starpu_data_state_t *child_handle = &handle->children[child];
+		struct _starpu_data_state *child_handle = &handle->children[child];
 		if (child_handle->nchildren > 0)
 			starpu_data_set_reduction_methods(child_handle, redux_cl, init_cl);
 	}
@@ -41,27 +45,28 @@ void starpu_data_set_reduction_methods(starpu_data_handle handle,
 	_starpu_spin_unlock(&handle->header_lock);
 }
 
-void _starpu_redux_init_data_replicate(starpu_data_handle handle, struct starpu_data_replicate_s *replicate, int workerid)
+void _starpu_redux_init_data_replicate(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, int workerid)
 {
 	STARPU_ASSERT(replicate);
 	STARPU_ASSERT(replicate->allocated);
 
-	struct starpu_codelet_t *init_cl = handle->init_cl;
+	struct starpu_codelet *init_cl = handle->init_cl;
 	STARPU_ASSERT(init_cl);
 
-	cl_func init_func = NULL;
-	
+	_starpu_cl_func_t init_func = NULL;
+
 	/* TODO Check that worker may execute the codelet */
 
-	switch (starpu_worker_get_type(workerid)) {
+	switch (starpu_worker_get_type(workerid))
+	{
 		case STARPU_CPU_WORKER:
-			init_func = init_cl->cpu_func;
+			init_func = _starpu_task_get_cpu_nth_implementation(init_cl, 0);
 			break;
 		case STARPU_CUDA_WORKER:
-			init_func = init_cl->cuda_func;
+			init_func = _starpu_task_get_cuda_nth_implementation(init_cl, 0);
 			break;
 		case STARPU_OPENCL_WORKER:
-			init_func = init_cl->opencl_func;
+			init_func = _starpu_task_get_opencl_nth_implementation(init_cl, 0);
 			break;
 		default:
 			STARPU_ABORT();
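
The switch above simply picks the first implementation registered for the worker's architecture when running the user-provided init codelet. A toy version of that selection; the codelet_impls layout, the arch enum and pick_init_kernel() are invented for the example:

typedef void (*kernel_func)(void *buffers[], void *cl_arg);

enum arch { ARCH_CPU, ARCH_CUDA, ARCH_OPENCL, ARCH_NB };

struct codelet_impls
{
	kernel_func funcs[ARCH_NB][4];   /* [architecture][implementation] */
};

static kernel_func pick_init_kernel(const struct codelet_impls *cl, enum arch a)
{
	return cl->funcs[a][0];          /* implementation 0, as for init codelets */
}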
@@ -77,7 +82,7 @@ void _starpu_redux_init_data_replicate(starpu_data_handle handle, struct starpu_
 
 /* Enable reduction mode. This function must be called with the header lock
  * taken. */
-void starpu_data_start_reduction_mode(starpu_data_handle handle)
+void _starpu_data_start_reduction_mode(starpu_data_handle_t handle)
 {
 	STARPU_ASSERT(handle->reduction_refcnt == 0);
 
@@ -86,22 +91,40 @@ void starpu_data_start_reduction_mode(starpu_data_handle handle)
 	unsigned nworkers = starpu_worker_get_count();
 	for (worker = 0; worker < nworkers; worker++)
 	{
-		struct starpu_data_replicate_s *replicate;
+		struct _starpu_data_replicate *replicate;
 		replicate = &handle->per_worker[worker];
 		replicate->initialized = 0;
+		replicate->relaxed_coherency = 2;
+		if (replicate->mc)
+			replicate->mc->relaxed_coherency = 2;
 	}
 }
 
 //#define NO_TREE_REDUCTION
 
 /* Force reduction. The lock should already have been taken.  */
-void starpu_data_end_reduction_mode(starpu_data_handle handle)
+void _starpu_data_end_reduction_mode(starpu_data_handle_t handle)
 {
 	unsigned worker;
+	unsigned node;
+	unsigned empty; /* Whether the handle is initially unallocated */
 
 	/* Put every valid replicate in the same array */
 	unsigned replicate_count = 0;
-	starpu_data_handle replicate_array[STARPU_NMAXWORKERS];
+	starpu_data_handle_t replicate_array[1 + STARPU_NMAXWORKERS];
+
+	for (node = 0; node < STARPU_MAXNODES; node++)
+	{
+		if (handle->per_node[node].state != STARPU_INVALID)
+			break;
+	}
+	empty = node == STARPU_MAXNODES;
+
+#ifndef NO_TREE_REDUCTION
+	if (!empty)
+		/* Include the initial value into the reduction tree */
+		replicate_array[replicate_count++] = handle;
+#endif
 
 	/* Register all valid per-worker replicates */
 	unsigned nworkers = starpu_worker_get_count();
@@ -112,7 +135,7 @@ void starpu_data_end_reduction_mode(starpu_data_handle handle)
 			/* Make sure the replicate is not removed */
 			handle->per_worker[worker].refcnt++;
 
-			uint32_t home_node = starpu_worker_get_memory_node(worker); 
+			uint32_t home_node = starpu_worker_get_memory_node(worker);
 			starpu_data_register(&handle->reduction_tmp_handles[worker],
 				home_node, handle->per_worker[worker].data_interface, handle->ops);
 
@@ -120,21 +143,40 @@ void starpu_data_end_reduction_mode(starpu_data_handle handle)
 
 			replicate_array[replicate_count++] = handle->reduction_tmp_handles[worker];
 		}
-		else {
+		else
+		{
 			handle->reduction_tmp_handles[worker] = NULL;
 		}
 	}
 
 #ifndef NO_TREE_REDUCTION
-	handle->reduction_refcnt = 1;
+	if (empty) {
+		/* Only the final copy will touch the actual handle */
+		handle->reduction_refcnt = 1;
+	} else {
+		unsigned step = 1;
+		handle->reduction_refcnt = 0;
+		while (step < replicate_count)
+		{
+			/* Each stage will touch the actual handle */
+			handle->reduction_refcnt++;
+			step *= 2;
+		}
+	}
 #else
 	/* We know that in this reduction algorithm there is exactly one task per valid replicate. */
-	handle->reduction_refcnt = replicate_count;
+	handle->reduction_refcnt = replicate_count + empty;
 #endif
 
 //	fprintf(stderr, "REDUX REFCNT = %d\n", handle->reduction_refcnt);
-	
-	if (replicate_count > 0)
+
+	if (replicate_count >
+#ifndef NO_TREE_REDUCTION
+			!empty
+#else
+			0
+#endif
+			)
 	{
 		/* Temporarily unlock the handle */
 		_starpu_spin_unlock(&handle->header_lock);
@@ -144,9 +186,13 @@ void starpu_data_end_reduction_mode(starpu_data_handle handle)
 		 * replicate */
 		struct starpu_task *last_replicate_deps[replicate_count];
 		memset(last_replicate_deps, 0, replicate_count*sizeof(struct starpu_task *));
-	
-		unsigned step = 1;
-		while (step <= replicate_count)
+		struct starpu_task *redux_tasks[replicate_count];
+
+		/* Redux step-by-step for step from 1 to replicate_count/2, i.e.
+		 * 1-by-1, then 2-by-2, then 4-by-4, etc. */
+		unsigned step;
+		unsigned redux_task_idx = 0;
+		for (step = 1; step < replicate_count; step *=2)
 		{
 			unsigned i;
 			for (i = 0; i < replicate_count; i+=2*step)
@@ -156,90 +202,108 @@ void starpu_data_end_reduction_mode(starpu_data_handle handle)
 					/* Perform the reduction between replicates i
 					 * and i+step and put the result in replicate i */
 					struct starpu_task *redux_task = starpu_task_create();
-		
+
+					/* Mark these tasks so that StarPU does not block them
+					 * when they try to access the handle (for normal tasks,
+					 * data requests to that handle are frozen until the
+					 * data is coherent again). */
+					struct _starpu_job *j = _starpu_get_job_associated_to_task(redux_task);
+					j->reduction_task = 1;
+
 					redux_task->cl = handle->redux_cl;
 					STARPU_ASSERT(redux_task->cl);
-		
-					redux_task->buffers[0].handle = replicate_array[i];
-					redux_task->buffers[0].mode = STARPU_RW;
-		
-					redux_task->buffers[1].handle = replicate_array[i+step];
-					redux_task->buffers[1].mode = STARPU_R;
-	
-					redux_task->detach = 0;
-	
+
+					redux_task->handles[0] = replicate_array[i];
+					redux_task->cl->modes[0] = STARPU_RW;
+
+					redux_task->handles[1] = replicate_array[i+step];
+					redux_task->cl->modes[1] = STARPU_R;
+
 					int ndeps = 0;
 					struct starpu_task *task_deps[2];
-	
+
 					if (last_replicate_deps[i])
 						task_deps[ndeps++] = last_replicate_deps[i];
-	
+
 					if (last_replicate_deps[i+step])
 						task_deps[ndeps++] = last_replicate_deps[i+step];
-	
+
 					/* i depends on this task */
 					last_replicate_deps[i] = redux_task;
-	
+
 					/* we don't perform the reduction until both replicates are ready */
-					starpu_task_declare_deps_array(redux_task, ndeps, task_deps); 
-		
-					int ret = _starpu_task_submit_internal(redux_task);
-					STARPU_ASSERT(!ret);
-		
+					starpu_task_declare_deps_array(redux_task, ndeps, task_deps);
+
+					/* We cannot submit tasks here: we do
+					 * not want to depend on tasks that have
+					 * been completed, so we just store
+					 * this task: it will be submitted
+					 * later. */
+					redux_tasks[redux_task_idx++] = redux_task;
 				}
 			}
-
-			step *= 2;
 		}
-	
-		struct starpu_task *redux_task = starpu_task_create();
-
-		/* Mark these tasks so that StarPU does not block them
-		 * when they try to access the handle (normal tasks are
-		 * data requests to that handle are frozen until the
-		 * data is coherent again). */
-		starpu_job_t j = _starpu_get_job_associated_to_task(redux_task);
-		j->reduction_task = 1;
 
-		redux_task->cl = handle->redux_cl;
-		STARPU_ASSERT(redux_task->cl);
+		if (empty)
+			/* The handle was empty, we just need to copy the reduced value. */
+			_starpu_data_cpy(handle, replicate_array[0], 1, NULL, 0, 1, last_replicate_deps[0]);
 
-		redux_task->buffers[0].handle = handle;
-		redux_task->buffers[0].mode = STARPU_RW;
+		/* Let's submit all the reduction tasks. */
+		unsigned i;
+		for (i = 0; i < redux_task_idx; i++)
+		{
+			int ret = starpu_task_submit(redux_tasks[i]);
+			STARPU_ASSERT(ret == 0);
+		}
+#else
+		if (empty) {
+			struct starpu_task *redux_task = starpu_task_create();
 
-		redux_task->buffers[1].handle = replicate_array[0];
-		redux_task->buffers[1].mode = STARPU_R;
+			/* Mark these tasks so that StarPU does not block them
+			 * when they try to access the handle (for normal tasks,
+			 * data requests to that handle are frozen until the
+			 * data is coherent again). */
+			struct _starpu_job *j = _starpu_get_job_associated_to_task(redux_task);
+			j->reduction_task = 1;
 
-		if (last_replicate_deps[0])
-			starpu_task_declare_deps_array(redux_task, 1, &last_replicate_deps[0]);
+			redux_task->cl = handle->init_cl;
+			STARPU_ASSERT(redux_task->cl);
+#ifdef STARPU_DEVEL
+#  warning the mode should already be set in the codelet. Only check it is valid?
+#endif
+			redux_task->cl->modes[0] = STARPU_W;
+			redux_task->handles[0] = handle;
 
-		int ret = _starpu_task_submit_internal(redux_task);
-		STARPU_ASSERT(!ret);
+			int ret = starpu_task_submit(redux_task);
+			STARPU_ASSERT(!ret);
+		}
 
-#else
 		/* Create a set of tasks to perform the reduction */
 		unsigned replicate;
 		for (replicate = 0; replicate < replicate_count; replicate++)
 		{
 			struct starpu_task *redux_task = starpu_task_create();
-	
+
 			/* Mark these tasks so that StarPU does not block them
 			 * when they try to access the handle (normal tasks are
 			 * data requests to that handle are frozen until the
 			 * data is coherent again). */
-			starpu_job_t j = _starpu_get_job_associated_to_task(redux_task);
+			struct _starpu_job *j = _starpu_get_job_associated_to_task(redux_task);
 			j->reduction_task = 1;
-	
+
 			redux_task->cl = handle->redux_cl;
 			STARPU_ASSERT(redux_task->cl);
-	
-			redux_task->buffers[0].handle = handle;
-			redux_task->buffers[0].mode = STARPU_RW;
-	
-			redux_task->buffers[1].handle = replicate_array[replicate];
-			redux_task->buffers[1].mode = STARPU_R;
-	
-			int ret = _starpu_task_submit_internal(redux_task);
+
+#ifdef STARPU_DEVEL
+#  warning the modes should already be set in the codelet. Only check they are valid?
+#endif
+			redux_task->cl->modes[0] = STARPU_RW;
+			redux_task->cl->modes[1] = STARPU_R;
+
+			redux_task->handles[0] = handle;
+			redux_task->handles[1] = replicate_array[replicate];
+
+			int ret = starpu_task_submit(redux_task);
 			STARPU_ASSERT(!ret);
 		}
 #endif
@@ -247,17 +311,26 @@ void starpu_data_end_reduction_mode(starpu_data_handle handle)
 	_starpu_spin_lock(&handle->header_lock);
 
 	}
+
+	for (worker = 0; worker < nworkers; worker++)
+	{
+		struct _starpu_data_replicate *replicate;
+		replicate = &handle->per_worker[worker];
+		replicate->relaxed_coherency = 1;
+		if (replicate->mc)
+			replicate->mc->relaxed_coherency = 1;
+	}
 }
 
-void starpu_data_end_reduction_mode_terminate(starpu_data_handle handle)
+void _starpu_data_end_reduction_mode_terminate(starpu_data_handle_t handle)
 {
 	unsigned nworkers = starpu_worker_get_count();
 
-//	fprintf(stderr, "starpu_data_end_reduction_mode_terminate\n");
+//	fprintf(stderr, "_starpu_data_end_reduction_mode_terminate\n");
 	unsigned worker;
 	for (worker = 0; worker < nworkers; worker++)
 	{
-		struct starpu_data_replicate_s *replicate;
+		struct _starpu_data_replicate *replicate;
 		replicate = &handle->per_worker[worker];
 		replicate->initialized = 0;
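
The reduction_refcnt arithmetic above counts how many tree stages will eventually write into the real handle: one per doubling of the step until it reaches the number of contributing replicates, or just one (the final copy) when the handle started out empty. A tiny self-contained check of that step count; reduction_stages() is an invented name that reproduces the step loop from _starpu_data_end_reduction_mode():

#include <assert.h>

/* Smallest s such that 2^s >= replicate_count, i.e. the number of
 * pairwise-reduction passes performed by the tree reduction. */
static unsigned reduction_stages(unsigned replicate_count)
{
	unsigned step = 1, stages = 0;

	while (step < replicate_count)
	{
		stages++;        /* one more pass of pairwise reductions */
		step *= 2;
	}
	return stages;
}

int main(void)
{
	assert(reduction_stages(1) == 0);   /* a single replicate needs no merging */
	assert(reduction_stages(2) == 1);
	assert(reduction_stages(5) == 3);   /* 5 -> 3 -> 2 -> 1 partial results */
	assert(reduction_stages(8) == 3);
	return 0;
}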
 

+ 33 - 12
src/datawizard/sort_data_handles.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -26,10 +26,10 @@
  * them in order, so that we need a total order over data. We must also not
  * lock a child before its parent. */
 
-static void find_data_path(struct starpu_data_state_t *data, unsigned path[])
+static void find_data_path(struct _starpu_data_state *data, unsigned path[])
 {
 	unsigned depth = data->depth;
-	struct starpu_data_state_t *current = data;
+	struct _starpu_data_state *current = data;
 
 	/* Compute the path from the root to the data */
 	unsigned level; /* level is the distance between the node and the current node */
@@ -39,7 +39,7 @@ static void find_data_path(struct starpu_data_state_t *data, unsigned path[])
 		path[depth - level - 1] = current->sibling_index;
 		current = data->father_handle;
 	}
-} 
+}
 
 static int _compar_data_paths(const unsigned pathA[], unsigned depthA,
 				const unsigned pathB[], unsigned depthB)
@@ -64,12 +64,33 @@ static int _compar_data_paths(const unsigned pathA[], unsigned depthA,
 
 /* A comparison function between two handles makes it possible to use qsort to
  * sort a list of handles */
-static int _starpu_compar_handles(struct starpu_data_state_t *dataA,
-				struct starpu_data_state_t *dataB)
+static int _starpu_compar_handles(const struct starpu_buffer_descr *descrA,
+				  const struct starpu_buffer_descr *descrB)
 {
+	struct _starpu_data_state *dataA = descrA->handle;
+	struct _starpu_data_state *dataB = descrB->handle;
+
 	/* Perhaps we have the same piece of data */
 	if (dataA == dataB)
-		return 0;
+	{
+		/* Process write requests first; this is needed for proper
+		 * locking, see _submit_job_enforce_data_deps,
+		 * _starpu_fetch_task_input, and _starpu_push_task_output  */
+		if (descrA->mode & STARPU_W)
+		{
+			if (descrB->mode & STARPU_W)
+				/* Both A and B write, take the reader first */
+				if (descrA->mode & STARPU_R)
+					return -1;
+				else
+					return 1;
+			else
+				/* Only A writes, take it first */
+				return -1;
+		} else
+			/* A doesn't write, take B before */
+			return 1;
+	}
 
 	/* In case we have data/subdata from different trees */
 	if (dataA->root_handle != dataB->root_handle)
@@ -88,14 +109,14 @@ static int _starpu_compar_handles(struct starpu_data_state_t *dataA,
 
 static int _starpu_compar_buffer_descr(const void *_descrA, const void *_descrB)
 {
-	const starpu_buffer_descr *descrA = (const starpu_buffer_descr *) _descrA;
-	const starpu_buffer_descr *descrB = (const starpu_buffer_descr *) _descrB;
+	const struct starpu_buffer_descr *descrA = (const struct starpu_buffer_descr *) _descrA;
+	const struct starpu_buffer_descr *descrB = (const struct starpu_buffer_descr *) _descrB;
 
-	return _starpu_compar_handles(descrA->handle, descrB->handle);
+	return _starpu_compar_handles(descrA, descrB);
 }
 
 /* The descr array will be overwritten, so this must be a copy ! */
-void _starpu_sort_task_handles(starpu_buffer_descr descr[], unsigned nbuffers)
+void _starpu_sort_task_handles(struct starpu_buffer_descr descr[], unsigned nbuffers)
 {
-	qsort(descr, nbuffers, sizeof(starpu_buffer_descr), _starpu_compar_buffer_descr);
+	qsort(descr, nbuffers, sizeof(struct starpu_buffer_descr), _starpu_compar_buffer_descr);
 }
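
_starpu_compar_handles() now also orders duplicate handles within a task: the write access must be sorted before the read access to the same handle (and an RW access before a plain W) so that _starpu_fetch_task_input() and friends take the strongest access first. A self-contained model of that ordering, usable with qsort(); the address-based order between distinct handles replaces the real root/path comparison of the partition tree:

#include <stdint.h>
#include <stdlib.h>

enum mode { MODE_R = 1, MODE_W = 2, MODE_RW = MODE_R | MODE_W };

struct buffer_descr
{
	void *handle;
	enum mode mode;
};

/* Same handle: writers first, RW before plain W.
 * Different handles: any stable total order will do for the sketch. */
static int compar_buffers(const void *a_, const void *b_)
{
	const struct buffer_descr *a = a_;
	const struct buffer_descr *b = b_;

	if (a->handle == b->handle)
	{
		int aw = !!(a->mode & MODE_W), bw = !!(b->mode & MODE_W);
		if (aw != bw)
			return aw ? -1 : 1;          /* writer first */
		int ar = !!(a->mode & MODE_R), br = !!(b->mode & MODE_R);
		if (aw && ar != br)
			return ar ? -1 : 1;          /* RW before plain W */
		return 0;
	}

	uintptr_t ha = (uintptr_t) a->handle, hb = (uintptr_t) b->handle;
	return ha < hb ? -1 : 1;
}

static void sort_buffers(struct buffer_descr *descr, size_t n)
{
	qsort(descr, n, sizeof(*descr), compar_buffers);
}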

+ 1 - 1
src/datawizard/sort_data_handles.h

@@ -29,6 +29,6 @@
 /* To avoid deadlocks, we reorder the different buffers accessed by the task
  * so that we always grab the rw-lock associated to the handles in the same
  * order. */
-void _starpu_sort_task_handles(starpu_buffer_descr descr[], unsigned nbuffers);
+void _starpu_sort_task_handles(struct starpu_buffer_descr descr[], unsigned nbuffers);
 
 #endif // SORT_DATA_HANDLES

+ 143 - 99
src/datawizard/user_interactions.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -22,16 +22,17 @@
 #include <datawizard/copy_driver.h>
 #include <datawizard/write_back.h>
 #include <core/dependencies/data_concurrency.h>
+#include <core/sched_policy.h>
 
 /* Explicitly ask StarPU to allocate room for a piece of data on the specified
  * memory node. */
-int starpu_data_request_allocation(starpu_data_handle handle, uint32_t node)
+int starpu_data_request_allocation(starpu_data_handle_t handle, uint32_t node)
 {
-	starpu_data_request_t r;
+	struct _starpu_data_request *r;
 
 	STARPU_ASSERT(handle);
 
-	r = _starpu_create_data_request(handle, NULL, &handle->per_node[node], node, 0, 0);
+	r = _starpu_create_data_request(handle, NULL, &handle->per_node[node], node, STARPU_NONE, 0, 0);
 
 	/* we do not increase the refcnt associated to the request since we are
 	 * not waiting for its termination */
@@ -41,9 +42,10 @@ int starpu_data_request_allocation(starpu_data_handle handle, uint32_t node)
 	return 0;
 }
 
-struct user_interaction_wrapper {
-	starpu_data_handle handle;
-	starpu_access_mode mode;
+struct user_interaction_wrapper
+{
+	starpu_data_handle_t handle;
+	enum starpu_access_mode mode;
 	unsigned node;
 	pthread_cond_t cond;
 	pthread_mutex_t lock;
@@ -63,7 +65,7 @@ struct user_interaction_wrapper {
 static void _starpu_data_acquire_fetch_data_callback(void *arg)
 {
 	struct user_interaction_wrapper *wrapper = (struct user_interaction_wrapper *) arg;
-	starpu_data_handle handle = wrapper->handle;
+	starpu_data_handle_t handle = wrapper->handle;
 
 	/* At that moment, the caller holds a reference to the piece of data.
 	 * We enqueue the "post" sync task in the list associated to the handle
@@ -81,14 +83,14 @@ static void _starpu_data_acquire_continuation_non_blocking(void *arg)
 	int ret;
 	struct user_interaction_wrapper *wrapper = (struct user_interaction_wrapper *) arg;
 
-	starpu_data_handle handle = wrapper->handle;
+	starpu_data_handle_t handle = wrapper->handle;
 
 	STARPU_ASSERT(handle);
 
-	struct starpu_data_replicate_s *ram_replicate = &handle->per_node[0];
+	struct _starpu_data_replicate *ram_replicate = &handle->per_node[0];
 
-	ret = _starpu_fetch_data_on_node(handle, ram_replicate, wrapper->mode, 1,
-			_starpu_data_acquire_fetch_data_callback, wrapper);
+	ret = _starpu_fetch_data_on_node(handle, ram_replicate, wrapper->mode, 0, 1,
+					 _starpu_data_acquire_fetch_data_callback, wrapper);
 	STARPU_ASSERT(!ret);
 }
 
@@ -108,10 +110,11 @@ static void starpu_data_acquire_cb_pre_sync_callback(void *arg)
 }
 
 /* The data must be released by calling starpu_data_release later on */
-int starpu_data_acquire_cb(starpu_data_handle handle,
-		starpu_access_mode mode, void (*callback)(void *), void *arg)
+int starpu_data_acquire_cb(starpu_data_handle_t handle,
+			   enum starpu_access_mode mode, void (*callback)(void *), void *arg)
 {
 	STARPU_ASSERT(handle);
+	STARPU_ASSERT_MSG(handle->nchildren == 0, "Acquiring a partitioned data is not possible");
         _STARPU_LOG_IN();
 
 	struct user_interaction_wrapper *wrapper = (struct user_interaction_wrapper *) malloc(sizeof(struct user_interaction_wrapper));
@@ -121,21 +124,15 @@ int starpu_data_acquire_cb(starpu_data_handle handle,
 	wrapper->mode = mode;
 	wrapper->callback = callback;
 	wrapper->callback_arg = arg;
-	PTHREAD_COND_INIT(&wrapper->cond, NULL);
-	PTHREAD_MUTEX_INIT(&wrapper->lock, NULL);
+	_STARPU_PTHREAD_COND_INIT(&wrapper->cond, NULL);
+	_STARPU_PTHREAD_MUTEX_INIT(&wrapper->lock, NULL);
 	wrapper->finished = 0;
 
-#ifdef STARPU_DEVEL
-#warning TODO instead of having the is_prefetch argument, _starpu_fetch_data shoud consider two flags: async and detached
-#endif
-	_starpu_spin_lock(&handle->header_lock);
-	handle->per_node[0].refcnt++;
-	_starpu_spin_unlock(&handle->header_lock);
-
-	PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
 	int sequential_consistency = handle->sequential_consistency;
 	if (sequential_consistency)
 	{
+		struct starpu_task *new_task;
 		wrapper->pre_sync_task = starpu_task_create();
 		wrapper->pre_sync_task->detach = 1;
 		wrapper->pre_sync_task->callback_func = starpu_data_acquire_cb_pre_sync_callback;
@@ -145,21 +142,27 @@ int starpu_data_acquire_cb(starpu_data_handle handle,
 		wrapper->post_sync_task->detach = 1;
 
 #ifdef STARPU_USE_FXT
-                starpu_job_t job = _starpu_get_job_associated_to_task(wrapper->pre_sync_task);
+                struct _starpu_job *job = _starpu_get_job_associated_to_task(wrapper->pre_sync_task);
                 job->model_name = "acquire_cb_pre";
                 job = _starpu_get_job_associated_to_task(wrapper->post_sync_task);
                 job->model_name = "acquire_cb_post";
 #endif
 
-		_starpu_detect_implicit_data_deps_with_handle(wrapper->pre_sync_task, wrapper->post_sync_task, handle, mode);
-		PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
+		new_task = _starpu_detect_implicit_data_deps_with_handle(wrapper->pre_sync_task, wrapper->post_sync_task, handle, mode);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
+
+		if (new_task) {
+			int ret = starpu_task_submit(new_task);
+			STARPU_ASSERT(!ret);
+		}
 
 		/* TODO detect if this is superfluous */
-		int ret = _starpu_task_submit_internal(wrapper->pre_sync_task);
+		int ret = starpu_task_submit(wrapper->pre_sync_task);
 		STARPU_ASSERT(!ret);
 	}
-	else {
-		PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
+	else
+	{
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 
 		starpu_data_acquire_cb_pre_sync_callback(wrapper);
 	}
@@ -175,33 +178,48 @@ static inline void _starpu_data_acquire_continuation(void *arg)
 {
 	struct user_interaction_wrapper *wrapper = (struct user_interaction_wrapper *) arg;
 
-	starpu_data_handle handle = wrapper->handle;
+	starpu_data_handle_t handle = wrapper->handle;
 
 	STARPU_ASSERT(handle);
 
-	struct starpu_data_replicate_s *ram_replicate = &handle->per_node[0];
+	struct _starpu_data_replicate *ram_replicate = &handle->per_node[0];
+
+	_starpu_fetch_data_on_node(handle, ram_replicate, wrapper->mode, 0, 0, NULL, NULL);
 
-	_starpu_fetch_data_on_node(handle, ram_replicate, wrapper->mode, 0, NULL, NULL);
-	
 	/* continuation of starpu_data_acquire */
-	PTHREAD_MUTEX_LOCK(&wrapper->lock);
+	_STARPU_PTHREAD_MUTEX_LOCK(&wrapper->lock);
 	wrapper->finished = 1;
-	PTHREAD_COND_SIGNAL(&wrapper->cond);
-	PTHREAD_MUTEX_UNLOCK(&wrapper->lock);
+	_STARPU_PTHREAD_COND_SIGNAL(&wrapper->cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&wrapper->lock);
 }
 
 /* The data must be released by calling starpu_data_release later on */
-int starpu_data_acquire(starpu_data_handle handle, starpu_access_mode mode)
+int starpu_data_acquire(starpu_data_handle_t handle, enum starpu_access_mode mode)
 {
 	STARPU_ASSERT(handle);
+	STARPU_ASSERT_MSG(handle->nchildren == 0, "Acquiring a partitioned data is not possible");
         _STARPU_LOG_IN();
 
-	/* it is forbidden to call this function from a callback or a codelet */
-	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls())) {
+	/* unless asynchronous, it is forbidden to call this function from a callback or a codelet */
+	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
+	{
                 _STARPU_LOG_OUT_TAG("EDEADLK");
 		return -EDEADLK;
         }
 
+	if (_starpu_data_is_multiformat_handle(handle) &&
+	    _starpu_handle_needs_conversion_task(handle, 0))
+	{
+		struct starpu_task *task = _starpu_create_conversion_task(handle, 0);
+		int ret;
+		handle->refcnt--;
+		handle->busy_count--;
+		handle->mf_node = 0;
+		task->synchronous = 1;
+		ret = starpu_task_submit(task);
+		STARPU_ASSERT(!ret);
+	}
+
 	struct user_interaction_wrapper wrapper =
 	{
 		.handle = handle,
@@ -213,10 +231,11 @@ int starpu_data_acquire(starpu_data_handle handle, starpu_access_mode mode)
 	};
 
 //	_STARPU_DEBUG("TAKE sequential_consistency_mutex starpu_data_acquire\n");
-	PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
 	int sequential_consistency = handle->sequential_consistency;
 	if (sequential_consistency)
 	{
+		struct starpu_task *new_task;
 		wrapper.pre_sync_task = starpu_task_create();
 		wrapper.pre_sync_task->detach = 0;
 
@@ -224,23 +243,27 @@ int starpu_data_acquire(starpu_data_handle handle, starpu_access_mode mode)
 		wrapper.post_sync_task->detach = 1;
 
 #ifdef STARPU_USE_FXT
-                starpu_job_t job = _starpu_get_job_associated_to_task(wrapper.pre_sync_task);
+                struct _starpu_job *job = _starpu_get_job_associated_to_task(wrapper.pre_sync_task);
                 job->model_name = "acquire_pre";
                 job = _starpu_get_job_associated_to_task(wrapper.post_sync_task);
                 job->model_name = "acquire_post";
 #endif
 
-		_starpu_detect_implicit_data_deps_with_handle(wrapper.pre_sync_task, wrapper.post_sync_task, handle, mode);
-		PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
+		new_task = _starpu_detect_implicit_data_deps_with_handle(wrapper.pre_sync_task, wrapper.post_sync_task, handle, mode);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
+		if (new_task) {
+			int ret = starpu_task_submit(new_task);
+			STARPU_ASSERT(!ret);
+		}
 
 		/* TODO detect if this is superfluous */
 		wrapper.pre_sync_task->synchronous = 1;
-		int ret = _starpu_task_submit_internal(wrapper.pre_sync_task);
+		int ret = starpu_task_submit(wrapper.pre_sync_task);
 		STARPU_ASSERT(!ret);
-		//starpu_task_wait(wrapper.pre_sync_task);
 	}
-	else {
-		PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
+	else
+	{
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 	}
 
 	/* we try to get the data, if we do not succeed immediately, we set a
@@ -249,15 +272,16 @@ int starpu_data_acquire(starpu_data_handle handle, starpu_access_mode mode)
 	if (!_starpu_attempt_to_submit_data_request_from_apps(handle, mode, _starpu_data_acquire_continuation, &wrapper))
 	{
 		/* no one has locked this data yet, so we proceed immediately */
-		struct starpu_data_replicate_s *ram_replicate = &handle->per_node[0];
-		int ret = _starpu_fetch_data_on_node(handle, ram_replicate, mode, 0, NULL, NULL);
+		struct _starpu_data_replicate *ram_replicate = &handle->per_node[0];
+		int ret = _starpu_fetch_data_on_node(handle, ram_replicate, mode, 0, 0, NULL, NULL);
 		STARPU_ASSERT(!ret);
 	}
-	else {
-		PTHREAD_MUTEX_LOCK(&wrapper.lock);
+	else
+	{
+		_STARPU_PTHREAD_MUTEX_LOCK(&wrapper.lock);
 		while (!wrapper.finished)
-			PTHREAD_COND_WAIT(&wrapper.cond, &wrapper.lock);
-		PTHREAD_MUTEX_UNLOCK(&wrapper.lock);
+			_STARPU_PTHREAD_COND_WAIT(&wrapper.cond, &wrapper.lock);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&wrapper.lock);
 	}
 
 	/* At that moment, the caller holds a reference to the piece of data.
@@ -272,7 +296,7 @@ int starpu_data_acquire(starpu_data_handle handle, starpu_access_mode mode)
 
 /* This function must be called after starpu_data_acquire so that the
  * application releases the data */
-void starpu_data_release(starpu_data_handle handle)
+void starpu_data_release(starpu_data_handle_t handle)
 {
 	STARPU_ASSERT(handle);
 
@@ -286,71 +310,91 @@ void starpu_data_release(starpu_data_handle handle)
 static void _prefetch_data_on_node(void *arg)
 {
 	struct user_interaction_wrapper *wrapper = (struct user_interaction_wrapper *) arg;
-	starpu_data_handle handle = wrapper->handle;
+	starpu_data_handle_t handle = wrapper->handle;
         int ret;
 
-	struct starpu_data_replicate_s *replicate = &handle->per_node[wrapper->node];
-	ret = _starpu_fetch_data_on_node(handle, replicate, STARPU_R, wrapper->async, NULL, NULL);
+	struct _starpu_data_replicate *replicate = &handle->per_node[wrapper->node];
+	ret = _starpu_fetch_data_on_node(handle, replicate, STARPU_R, wrapper->async, wrapper->async, NULL, NULL);
         STARPU_ASSERT(!ret);
 
-        PTHREAD_MUTEX_LOCK(&wrapper->lock);
-	wrapper->finished = 1;
-	PTHREAD_COND_SIGNAL(&wrapper->cond);
-	PTHREAD_MUTEX_UNLOCK(&wrapper->lock);
-
-	if (!wrapper->async)
-	{
-		_starpu_spin_lock(&handle->header_lock);
-		_starpu_notify_data_dependencies(handle);
-		_starpu_spin_unlock(&handle->header_lock);
+	if (wrapper->async)
+		free(wrapper);
+	else {
+		_STARPU_PTHREAD_MUTEX_LOCK(&wrapper->lock);
+		wrapper->finished = 1;
+		_STARPU_PTHREAD_COND_SIGNAL(&wrapper->cond);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&wrapper->lock);
 	}
 
+	_starpu_spin_lock(&handle->header_lock);
+	_starpu_notify_data_dependencies(handle);
+	_starpu_spin_unlock(&handle->header_lock);
 }
 
 static
-int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle handle, unsigned node, unsigned async, starpu_access_mode mode)
+int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigned node, unsigned async, enum starpu_access_mode mode)
 {
 	STARPU_ASSERT(handle);
 
 	/* it is forbidden to call this function from a callback or a codelet */
-	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
+	if (STARPU_UNLIKELY(!async && !_starpu_worker_may_perform_blocking_calls()))
 		return -EDEADLK;
 
-	struct user_interaction_wrapper wrapper =
-	{
-		.handle = handle,
-		.node = node,
-		.async = async,
-		.cond = PTHREAD_COND_INITIALIZER,
-		.lock = PTHREAD_MUTEX_INITIALIZER,
-		.finished = 0
-	};
+	struct user_interaction_wrapper *wrapper = (struct user_interaction_wrapper *) malloc(sizeof(*wrapper));
+
+	wrapper->handle = handle;
+	wrapper->node = node;
+	wrapper->async = async;
+	_STARPU_PTHREAD_COND_INIT(&wrapper->cond, NULL);
+	_STARPU_PTHREAD_MUTEX_INIT(&wrapper->lock, NULL);
+	wrapper->finished = 0;
 
-	if (!_starpu_attempt_to_submit_data_request_from_apps(handle, mode, _prefetch_data_on_node, &wrapper))
+	if (!_starpu_attempt_to_submit_data_request_from_apps(handle, mode, _prefetch_data_on_node, wrapper))
 	{
 		/* we can immediately proceed */
-		struct starpu_data_replicate_s *replicate = &handle->per_node[node];
-		_starpu_fetch_data_on_node(handle, replicate, mode, async, NULL, NULL);
+		struct _starpu_data_replicate *replicate = &handle->per_node[node];
+
+		free(wrapper);
+
+		_starpu_fetch_data_on_node(handle, replicate, mode, async, async, NULL, NULL);
 
 		/* remove the "lock"/reference */
+
+		_starpu_spin_lock(&handle->header_lock);
+
 		if (!async)
 		{
-			_starpu_spin_lock(&handle->header_lock);
-			_starpu_notify_data_dependencies(handle);
-			_starpu_spin_unlock(&handle->header_lock);
+			/* Release our refcnt, like _starpu_release_data_on_node would do */
+			replicate->refcnt--;
+			STARPU_ASSERT(replicate->refcnt >= 0);
+			STARPU_ASSERT(handle->busy_count > 0);
+			handle->busy_count--;
+			_starpu_data_check_not_busy(handle);
 		}
+
+		/* In case there was a temporary handle (eg. used for reduction), this
+		 * handle may have requested to be destroyed when the data is released
+		 * */
+		unsigned handle_was_destroyed = handle->lazy_unregister;
+
+		_starpu_notify_data_dependencies(handle);
+
+		if (!handle_was_destroyed)
+			_starpu_spin_unlock(&handle->header_lock);
 	}
-	else {
-		PTHREAD_MUTEX_LOCK(&wrapper.lock);
-		while (!wrapper.finished)
-			PTHREAD_COND_WAIT(&wrapper.cond, &wrapper.lock);
-		PTHREAD_MUTEX_UNLOCK(&wrapper.lock);
+	else if (!async)
+	{
+		_STARPU_PTHREAD_MUTEX_LOCK(&wrapper->lock);
+		while (!wrapper->finished)
+			_STARPU_PTHREAD_COND_WAIT(&wrapper->cond, &wrapper->lock);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&wrapper->lock);
+		free(wrapper);
 	}
 
 	return 0;
 }
 
-int starpu_data_prefetch_on_node(starpu_data_handle handle, unsigned node, unsigned async)
+int starpu_data_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 {
 	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R);
 }
@@ -359,7 +403,7 @@ int starpu_data_prefetch_on_node(starpu_data_handle handle, unsigned node, unsig
  *	It is possible to specify that a piece of data can be discarded without
  *	impacting the application.
  */
-void starpu_data_advise_as_important(starpu_data_handle handle, unsigned is_important)
+void starpu_data_advise_as_important(starpu_data_handle_t handle, unsigned is_important)
 {
 	_starpu_spin_lock(&handle->header_lock);
 
@@ -368,7 +412,7 @@ void starpu_data_advise_as_important(starpu_data_handle handle, unsigned is_impo
 	for (child = 0; child < handle->nchildren; child++)
 	{
 		/* make sure the intermediate children are advised as well */
-		struct starpu_data_state_t *child_handle = &handle->children[child];
+		struct _starpu_data_state *child_handle = &handle->children[child];
 		if (child_handle->nchildren > 0)
 			starpu_data_advise_as_important(child_handle, is_important);
 	}
@@ -380,7 +424,7 @@ void starpu_data_advise_as_important(starpu_data_handle handle, unsigned is_impo
 
 }
 
-void starpu_data_set_sequential_consistency_flag(starpu_data_handle handle, unsigned flag)
+void starpu_data_set_sequential_consistency_flag(starpu_data_handle_t handle, unsigned flag)
 {
 	_starpu_spin_lock(&handle->header_lock);
 
@@ -388,14 +432,14 @@ void starpu_data_set_sequential_consistency_flag(starpu_data_handle handle, unsi
 	for (child = 0; child < handle->nchildren; child++)
 	{
 		/* make sure that the flags are applied to the children as well */
-		struct starpu_data_state_t *child_handle = &handle->children[child];
+		struct _starpu_data_state *child_handle = &handle->children[child];
 		if (child_handle->nchildren > 0)
 			starpu_data_set_sequential_consistency_flag(child_handle, flag);
 	}
 
-	PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
 	handle->sequential_consistency = flag;
-	PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 
 	_starpu_spin_unlock(&handle->header_lock);
 }
@@ -414,7 +458,7 @@ void starpu_data_set_default_sequential_consistency_flag(unsigned flag)
 }
 
 /* Query the status of the handle on the specified memory node. */
-void starpu_data_query_status(starpu_data_handle handle, int memory_node, int *is_allocated, int *is_valid, int *is_requested)
+void starpu_data_query_status(starpu_data_handle_t handle, int memory_node, int *is_allocated, int *is_valid, int *is_requested)
 {
 #ifdef STARPU_DEVEL
 #warning FIXME
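
The public entry points renamed above keep their user-level semantics: starpu_data_acquire() blocks until the application may touch the main-memory copy in the requested mode, starpu_data_release() hands it back, and starpu_data_prefetch_on_node() requests an (optionally asynchronous) transfer towards a given memory node. A minimal usage sketch, not part of the patch, assuming a handle registered elsewhere and that node 1 is a device memory node:

    #include <starpu.h>

    /* "handle" is assumed to be registered beforehand; node 1 is assumed
     * to be a device memory node (node 0 is the RAM node used above). */
    static void use_handle(starpu_data_handle_t handle)
    {
        /* ask StarPU to bring a copy onto memory node 1, asynchronously */
        starpu_data_prefetch_on_node(handle, 1, 1);

        /* block until the application may safely access the RAM copy */
        if (starpu_data_acquire(handle, STARPU_RW) == 0)
        {
            /* ... inspect or modify the local copy here ... */
            starpu_data_release(handle);
        }
    }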

+ 32 - 16
src/datawizard/write_back.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -17,48 +17,64 @@
 
 #include <datawizard/datawizard.h>
 #include <datawizard/write_back.h>
+#include <core/dependencies/data_concurrency.h>
 
-void _starpu_write_through_data(starpu_data_handle handle, uint32_t requesting_node, 
-					   uint32_t write_through_mask)
+static void wt_callback(void *arg)
 {
-	if ((write_through_mask & ~(1<<requesting_node)) == 0) {
+	starpu_data_handle_t handle = (starpu_data_handle_t) arg;
+
+	_starpu_spin_lock(&handle->header_lock);
+	_starpu_notify_data_dependencies(handle);
+	_starpu_spin_unlock(&handle->header_lock);
+}
+
+void _starpu_write_through_data(starpu_data_handle_t handle, uint32_t requesting_node,
+				uint32_t write_through_mask)
+{
+	if ((write_through_mask & ~(1<<requesting_node)) == 0)
+	{
 		/* nothing will be done ... */
 		return;
 	}
 
 	/* first commit all changes onto the nodes specified by the mask */
-	uint32_t node;
-	for (node = 0; node < STARPU_MAXNODES; node++)
+	uint32_t node, max;
+	for (node = 0, max = starpu_memory_nodes_get_count(); node < max; node++)
 	{
-		if (write_through_mask & (1<<node)) {
+		if (write_through_mask & (1<<node))
+		{
 			/* we need to commit the buffer on that node */
-			if (node != requesting_node) 
+			if (node != requesting_node)
 			{
 				while (_starpu_spin_trylock(&handle->header_lock))
 					_starpu_datawizard_progress(requesting_node, 1);
 
-				starpu_data_request_t r;
-				r = create_request_to_fetch_data(handle, &handle->per_node[node],
-								STARPU_R, 0, NULL, NULL);
+				/* We need to keep a Read lock to avoid letting writers corrupt our copy.  */
+				STARPU_ASSERT(handle->current_mode != STARPU_REDUX);
+				STARPU_ASSERT(handle->current_mode != STARPU_SCRATCH);
+				handle->refcnt++;
+				handle->busy_count++;
+				handle->current_mode = STARPU_R;
+
+				struct _starpu_data_request *r;
+				r = _starpu_create_request_to_fetch_data(handle, &handle->per_node[node],
+									 STARPU_R, 1, 1, wt_callback, handle);
 
 			        /* If no request was created, the handle was already up-to-date on the
 			         * node */
 			        if (r)
-				{
 				        _starpu_spin_unlock(&handle->header_lock);
-        				_starpu_wait_data_request_completion(r, 1);
-				}
 			}
 		}
 	}
 }
 
-void starpu_data_set_wt_mask(starpu_data_handle handle, uint32_t wt_mask)
+void starpu_data_set_wt_mask(starpu_data_handle_t handle, uint32_t wt_mask)
 {
 	handle->wt_mask = wt_mask;
 
 	/* in case the data has some children, set their wt_mask as well */
-	if (handle->nchildren > 0) 
+	if (handle->nchildren > 0)
 	{
 		unsigned child;
 		for (child = 0; child < handle->nchildren; child++)

+ 2 - 2
src/datawizard/write_back.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,7 +24,7 @@
 /* If a write-through mask is associated with that data handle, this propagates
  * the current value of the data onto the different memory nodes in the
  * write_through_mask. */
-void _starpu_write_through_data(starpu_data_handle handle, uint32_t requesting_node, 
+void _starpu_write_through_data(starpu_data_handle_t handle, uint32_t requesting_node,
 					   uint32_t write_through_mask);
 
 #endif // __DW_WRITE_BACK_H__

+ 57 - 59
src/drivers/cpu/driver_cpu.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
@@ -27,22 +27,19 @@
 #include <core/sched_policy.h>
 #include <core/sched_ctx.h>
 
-static int execute_job_on_cpu(starpu_job_t j, struct starpu_worker_s *cpu_args, int is_parallel_task, int rank, enum starpu_perf_archtype perf_arch)
+static int execute_job_on_cpu(struct _starpu_job *j, struct _starpu_worker *cpu_args, int is_parallel_task, int rank, enum starpu_perf_archtype perf_arch)
 {
 	int ret;
 	struct timespec codelet_start, codelet_end;
 
-	unsigned calibrate_model = 0;
-	int workerid = cpu_args->workerid;
 	struct starpu_task *task = j->task;
-	struct starpu_codelet_t *cl = task->cl;
+	struct starpu_codelet *cl = task->cl;
 
 	STARPU_ASSERT(cl);
-	STARPU_ASSERT(cl->cpu_func);
 
 	if (rank == 0)
 	{
-		ret = _starpu_fetch_task_input(task, 0);
+		ret = _starpu_fetch_task_input(j, 0);
 		if (ret != 0)
 		{
 			/* there was not enough memory so the codelet cannot be executed right now ... */
@@ -52,7 +49,7 @@ static int execute_job_on_cpu(starpu_job_t j, struct starpu_worker_s *cpu_args,
 	}
 
 	if (is_parallel_task)
-		PTHREAD_BARRIER_WAIT(&j->before_work_barrier);
+		_STARPU_PTHREAD_BARRIER_WAIT(&j->before_work_barrier);
 
 	_starpu_driver_start_job(cpu_args, j, &codelet_start, rank);
 
@@ -60,31 +57,27 @@ static int execute_job_on_cpu(starpu_job_t j, struct starpu_worker_s *cpu_args,
 	 * execute the kernel at all. */
 	if ((rank == 0) || (cl->type != STARPU_FORKJOIN))
 	{
-		if (cl->cpu_func != STARPU_MULTIPLE_CPU_IMPLEMENTATIONS) {
-			cl_func func = cl->cpu_func;
-			STARPU_ASSERT(func);
-			func(task->interfaces, task->cl_arg);
-		}
-		else {
-			if (cl->cpu_funcs[j->nimpl] != NULL) {
-				/* _STARPU_DEBUG("CPU driver : running kernel (%d)\n", j->nimpl); */
-				cl_func func = cl->cpu_funcs[j->nimpl];
-				STARPU_ASSERT(func);
-				func(task->interfaces, task->cl_arg);
-			}
-		}
+		_starpu_cl_func_t func = _starpu_task_get_cpu_nth_implementation(cl, j->nimpl);
+		if (is_parallel_task && cl->type == STARPU_FORKJOIN)
+			/* bind to parallel worker */
+			_starpu_bind_thread_on_cpus(cpu_args->config, _starpu_get_combined_worker_struct(j->combined_workerid));
+		STARPU_ASSERT(func);
+		func(task->interfaces, task->cl_arg);
+		if (is_parallel_task && cl->type == STARPU_FORKJOIN)
+			/* rebind to single CPU */
+			_starpu_bind_thread_on_cpu(cpu_args->config, cpu_args->bindid);
 	}
 
-	_starpu_driver_end_job(cpu_args, j, &codelet_end, rank);
+	_starpu_driver_end_job(cpu_args, j, perf_arch, &codelet_end, rank);
 
 	if (is_parallel_task)
-		PTHREAD_BARRIER_WAIT(&j->after_work_barrier);
+		_STARPU_PTHREAD_BARRIER_WAIT(&j->after_work_barrier);
 
 	if (rank == 0)
 	{
 		_starpu_driver_update_job_feedback(j, cpu_args,
 				perf_arch, &codelet_start, &codelet_end);
-		_starpu_push_task_output(task, 0);
+		_starpu_push_task_output(j, 0);
 	}
 
 	return 0;
@@ -92,7 +85,7 @@ static int execute_job_on_cpu(starpu_job_t j, struct starpu_worker_s *cpu_args,
 
 void *_starpu_cpu_worker(void *arg)
 {
-	struct starpu_worker_s *cpu_arg = (struct starpu_worker_s *) arg;
+	struct _starpu_worker *cpu_arg = (struct _starpu_worker *) arg;
 	unsigned memnode = cpu_arg->memory_node;
 	int workerid = cpu_arg->workerid;
 	int devid = cpu_arg->devid;
@@ -100,7 +93,7 @@ void *_starpu_cpu_worker(void *arg)
 #ifdef STARPU_USE_FXT
 	_starpu_fxt_register_thread(cpu_arg->bindid);
 #endif
-	STARPU_TRACE_WORKER_INIT_START(STARPU_FUT_CPU_KEY, devid, memnode);
+	_STARPU_TRACE_WORKER_INIT_START(_STARPU_FUT_CPU_KEY, devid, memnode);
 
 	_starpu_bind_thread_on_cpu(cpu_arg->config, cpu_arg->bindid);
 
@@ -115,15 +108,15 @@ void *_starpu_cpu_worker(void *arg)
 
 	cpu_arg->status = STATUS_UNKNOWN;
 
-	STARPU_TRACE_WORKER_INIT_END
+	_STARPU_TRACE_WORKER_INIT_END
 
         /* tell the main thread that we are ready */
-	PTHREAD_MUTEX_LOCK(&cpu_arg->mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&cpu_arg->mutex);
 	cpu_arg->worker_is_initialized = 1;
-	PTHREAD_COND_SIGNAL(&cpu_arg->ready_cond);
-	PTHREAD_MUTEX_UNLOCK(&cpu_arg->mutex);
+	_STARPU_PTHREAD_COND_SIGNAL(&cpu_arg->ready_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&cpu_arg->mutex);
 
-        starpu_job_t j;
+        struct _starpu_job *j;
 	struct starpu_task *task;
 
 	int res;
@@ -133,42 +126,41 @@ void *_starpu_cpu_worker(void *arg)
 
 	while (_starpu_machine_is_running())
 	{
-		STARPU_TRACE_START_PROGRESS(memnode);
+		_STARPU_TRACE_START_PROGRESS(memnode);
 		_starpu_datawizard_progress(memnode, 1);
-		STARPU_TRACE_END_PROGRESS(memnode);
+		_STARPU_TRACE_END_PROGRESS(memnode);
 
 		/* take the mutex inside pop because it depends what mutex:
 		   the one of the local task or the one of one of the strategies */
 		task = _starpu_pop_task(cpu_arg);
 
-                if (!task) 
+                if (!task)
 		{
-			PTHREAD_MUTEX_LOCK(sched_mutex);
-			if (_starpu_worker_can_block(memnode)){
+			_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
+			if (_starpu_worker_can_block(memnode))
 				_starpu_block_worker(workerid, sched_cond, sched_mutex);
-			}
 
-			PTHREAD_MUTEX_UNLOCK(sched_mutex);
+			_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
 			continue;
 		};
 
-		STARPU_ASSERT(task);
 
+		STARPU_ASSERT(task);
 		j = _starpu_get_job_associated_to_task(task);
-	
+
 		/* can a cpu perform that task ? */
-		if (!STARPU_CPU_MAY_PERFORM(j)) 
+		if (!_STARPU_CPU_MAY_PERFORM(j))
 		{
 			/* put it at the end of the queue ... XXX */
-			_starpu_push_task(j, 0);
+			_starpu_push_task(j);
 			continue;
 		}
 
 		int rank = 0;
 		int is_parallel_task = (j->task_size > 1);
 
-		enum starpu_perf_archtype perf_arch; 
-	
+		enum starpu_perf_archtype perf_arch;
+
 		/* Get the rank in case it is a parallel task */
 		if (is_parallel_task)
 		{
@@ -176,11 +168,11 @@ void *_starpu_cpu_worker(void *arg)
 			STARPU_ASSERT(task != j->task);
 			free(task);
 
-			PTHREAD_MUTEX_LOCK(&j->sync_mutex);
+			_STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 			rank = j->active_task_alias_count++;
-			PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
+			_STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
 
-			struct starpu_combined_worker_s *combined_worker;
+			struct _starpu_combined_worker *combined_worker;
 			combined_worker = _starpu_get_combined_worker_struct(j->combined_workerid);
 
 			cpu_arg->combined_workerid = j->combined_workerid;
@@ -188,7 +180,8 @@ void *_starpu_cpu_worker(void *arg)
 			cpu_arg->current_rank = rank;
 			perf_arch = combined_worker->perf_arch;
 		}
-		else {
+		else
+		{
 			cpu_arg->combined_workerid = cpu_arg->workerid;
 			cpu_arg->worker_size = 1;
 			cpu_arg->current_rank = 0;
@@ -196,34 +189,39 @@ void *_starpu_cpu_worker(void *arg)
 		}
 
 		_starpu_set_current_task(j->task);
+		cpu_arg->current_task = j->task;
 
-		res = execute_job_on_cpu(j, cpu_arg, is_parallel_task, rank, perf_arch);
+                res = execute_job_on_cpu(j, cpu_arg, is_parallel_task, rank, perf_arch);
 
 		_starpu_set_current_task(NULL);
+		cpu_arg->current_task = NULL;
 
-		if (res) {
-			switch (res) {
+		if (res)
+		{
+			switch (res)
+			{
 				case -EAGAIN:
-					_starpu_push_task(j, 0);
+					_starpu_push_task(j);
 					continue;
-				default: 
-					assert(0);
+				default:
+					STARPU_ASSERT(0);
 			}
 		}
 
-		if (rank == 0){
-			_starpu_handle_job_termination(j, 0, workerid);
-		}
+		if (rank == 0)
+			_starpu_handle_job_termination(j, workerid);
         }
 
-	STARPU_TRACE_WORKER_DEINIT_START
+	_STARPU_TRACE_WORKER_DEINIT_START
+
+	_starpu_handle_all_pending_node_data_requests(memnode);
 
 	/* In case there remains some memory that was automatically
 	 * allocated by StarPU, we release it now. Note that data
 	 * coherency is not maintained anymore at that point ! */
 	_starpu_free_all_automatically_allocated_buffers(memnode);
 
-	STARPU_TRACE_WORKER_DEINIT_END(STARPU_FUT_CPU_KEY);
+	_STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_CPU_KEY);
 
 	pthread_exit(NULL);
 	return NULL;
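
The CPU driver now always resolves its kernel through _starpu_task_get_cpu_nth_implementation(), i.e. implementations are listed in the codelet's cpu_funcs[] array instead of a single cpu_func field. A sketch of the matching user-side declaration, not part of the patch (the kernel body is hypothetical, and .where is taken from the public header rather than from this diff):

    #include <starpu.h>

    /* Hypothetical kernel: the layout of buffers[] depends on the
     * registered data interface. */
    static void scal_cpu(void *buffers[], void *cl_arg)
    {
        (void) buffers;
        (void) cl_arg;
    }

    static struct starpu_codelet scal_cl =
    {
        .where = STARPU_CPU,
        .cpu_funcs = { scal_cpu, NULL },  /* further implementations may follow */
        .nbuffers = 1,
        .modes = { STARPU_RW },
    };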

+ 98 - 67
src/drivers/cuda/driver_cuda.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2011-2012  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
@@ -33,6 +33,7 @@ static int ncudagpus;
 
 static cudaStream_t streams[STARPU_NMAXWORKERS];
 static cudaStream_t transfer_streams[STARPU_NMAXWORKERS];
+static struct cudaDeviceProp props[STARPU_MAXCUDADEVS];
 
 /* In case we want to cap the amount of memory available on the GPUs by the
  * mean of the STARPU_LIMIT_GPU_MEM, we allocate a big buffer when the driver
@@ -51,20 +52,17 @@ static void limit_gpu_mem_if_needed(int devid)
 	}
 
 	/* Find the size of the memory on the device */
-	struct cudaDeviceProp prop;
-	cures = cudaGetDeviceProperties(&prop, devid);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-
-	size_t totalGlobalMem = prop.totalGlobalMem;
+	size_t totalGlobalMem = props[devid].totalGlobalMem;
 
 	/* How much memory to waste ? */
 	size_t to_waste = totalGlobalMem - (size_t)limit*1024*1024;
 
+	props[devid].totalGlobalMem -= to_waste;
+
 	_STARPU_DEBUG("CUDA device %d: Wasting %ld MB / Limit %ld MB / Total %ld MB / Remains %ld MB\n",
 			devid, (size_t)to_waste/(1024*1024), (size_t)limit, (size_t)totalGlobalMem/(1024*1024),
 			(size_t)(totalGlobalMem - to_waste)/(1024*1024));
-	
+
 	/* Allocate a large buffer to waste memory and constrain the amount of available memory. */
 	cures = cudaMalloc((void **)&wasted_memory[devid], to_waste);
 	if (STARPU_UNLIKELY(cures))
@@ -85,6 +83,11 @@ static void unlimit_gpu_mem_if_needed(int devid)
 	}
 }
 
+size_t starpu_cuda_get_global_mem_size(int devid)
+{
+	return (size_t)props[devid].totalGlobalMem;
+}
+
 cudaStream_t starpu_cuda_get_local_transfer_stream(void)
 {
 	int worker = starpu_worker_get_id();
@@ -99,6 +102,13 @@ cudaStream_t starpu_cuda_get_local_stream(void)
 	return streams[worker];
 }
 
+const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid)
+{
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+	unsigned devid = config->workers[workerid].devid;
+	return &props[devid];
+}
+
 static void init_context(int devid)
 {
 	cudaError_t cures;
@@ -109,7 +119,24 @@ static void init_context(int devid)
 		STARPU_CUDA_REPORT_ERROR(cures);
 
 	/* force CUDA to initialize the context for real */
-	cudaFree(0);
+	cures = cudaFree(0);
+	if (STARPU_UNLIKELY(cures)) {
+		if (cures == cudaErrorDevicesUnavailable) {
+			fprintf(stderr,"All CUDA-capable devices are busy or unavailable\n");
+			exit(77);
+		}
+		STARPU_CUDA_REPORT_ERROR(cures);
+	}
+
+	cures = cudaGetDeviceProperties(&props[devid], devid);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+#ifdef HAVE_CUDA_MEMCPY_PEER
+	if (props[devid].computeMode == cudaComputeModeExclusive) {
+		fprintf(stderr, "CUDA is in EXCLUSIVE-THREAD mode, but StarPU was built with multithread GPU control support, please either ask your administrator to use EXCLUSIVE-PROCESS mode (which should really be fine), or reconfigure with --disable-cuda-memcpy-peer but that will disable the memcpy-peer optimizations\n");
+		STARPU_ASSERT(0);
+	}
+#endif
 
 	limit_gpu_mem_if_needed(devid);
 
@@ -146,7 +173,8 @@ unsigned _starpu_get_cuda_device_count(void)
 	if (STARPU_UNLIKELY(cures))
 		 return 0;
 
-	if (cnt > STARPU_MAXCUDADEVS) {
+	if (cnt > STARPU_MAXCUDADEVS)
+	{
 		fprintf(stderr, "# Warning: %d CUDA devices available. Only %d enabled. Use configure option --enable-maxcudadev=xxx to update the maximum value of supported CUDA devices.\n", cnt, STARPU_MAXCUDADEVS);
 		cnt = STARPU_MAXCUDADEVS;
 	}
@@ -156,10 +184,10 @@ unsigned _starpu_get_cuda_device_count(void)
 void _starpu_init_cuda(void)
 {
 	ncudagpus = _starpu_get_cuda_device_count();
-	assert(ncudagpus <= STARPU_MAXCUDADEVS);
+	STARPU_ASSERT(ncudagpus <= STARPU_MAXCUDADEVS);
 }
 
-static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
+static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *args)
 {
 	int ret;
 	uint32_t mask = 0;
@@ -173,16 +201,17 @@ static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
 	unsigned calibrate_model = 0;
 
 	STARPU_ASSERT(task);
-	struct starpu_codelet_t *cl = task->cl;
+	struct starpu_codelet *cl = task->cl;
 	STARPU_ASSERT(cl);
 
-	if (cl->model && cl->model->benchmarking) 
+	if (cl->model && cl->model->benchmarking)
 		calibrate_model = 1;
 
-	ret = _starpu_fetch_task_input(task, mask);
-	if (ret != 0) {
+	ret = _starpu_fetch_task_input(j, mask);
+	if (ret != 0)
+	{
 		/* there was not enough memory, so the input of
-		 * the codelet cannot be fetched ... put the 
+		 * the codelet cannot be fetched ... put the
 		 * codelet back, and try it later */
 		return -EAGAIN;
 	}
@@ -199,34 +228,26 @@ static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
 #ifdef HAVE_CUDA_MEMCPY_PEER
 	/* We make sure we do manipulate the proper device */
 	cures = cudaSetDevice(args->devid);
+	if (STARPU_UNLIKELY(cures != cudaSuccess))
+		STARPU_CUDA_REPORT_ERROR(cures);
 #endif
 
-	if (cl->cuda_func != STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS) {
-		cl_func func = cl->cuda_func;
-		STARPU_ASSERT(func);
-		func(task->interfaces, task->cl_arg);
-	}
-	else {
-		if (cl->cuda_funcs[j->nimpl] != NULL) {
-			/* _STARPU_DEBUG("Cuda driver : running kernel * (%d)\n", j->nimpl); */
-			cl_func func = cl->cuda_funcs[j->nimpl];
-			STARPU_ASSERT(func);
-			func(task->interfaces, task->cl_arg);
-		}
-	}
+	starpu_cuda_func_t func = _starpu_task_get_cuda_nth_implementation(cl, j->nimpl);
+	STARPU_ASSERT(func);
+	func(task->interfaces, task->cl_arg);
 
-	_starpu_driver_end_job(args, j, &codelet_end, 0);
+	_starpu_driver_end_job(args, j, args->perf_arch, &codelet_end, 0);
 
 	_starpu_driver_update_job_feedback(j, args, args->perf_arch, &codelet_start, &codelet_end);
 
-	_starpu_push_task_output(task, mask);
+	_starpu_push_task_output(j, mask);
 
 	return 0;
 }
 
 void *_starpu_cuda_worker(void *arg)
 {
-	struct starpu_worker_s* args = arg;
+	struct _starpu_worker* args = arg;
 
 	int devid = args->devid;
 	int workerid = args->workerid;
@@ -235,7 +256,7 @@ void *_starpu_cuda_worker(void *arg)
 #ifdef STARPU_USE_FXT
 	_starpu_fxt_register_thread(args->bindid);
 #endif
-	STARPU_TRACE_WORKER_INIT_START(STARPU_FUT_CUDA_KEY, devid, memnode);
+	_STARPU_TRACE_WORKER_INIT_START(_STARPU_FUT_CUDA_KEY, devid, memnode);
 
 	_starpu_bind_thread_on_cpu(args->config, args->bindid);
 
@@ -252,26 +273,31 @@ void *_starpu_cuda_worker(void *arg)
 
 	/* get the device's name */
 	char devname[128];
-	struct cudaDeviceProp prop;
-	cudaGetDeviceProperties(&prop, devid);
-	strncpy(devname, prop.name, 128);
-#if CUDA_VERSION >= 3020
-	snprintf(args->name, sizeof(args->name), "CUDA %d (%s %02x:%02x.0)", args->devid, devname, prop.pciBusID, prop.pciDeviceID);
+	strncpy(devname, props[devid].name, 128);
+	float size = (float) props[devid].totalGlobalMem / (1<<30);
+
+#ifdef STARPU_HAVE_BUSID
+#ifdef STARPU_HAVE_DOMAINID
+	if (props[devid].pciDomainID)
+		snprintf(args->name, sizeof(args->name), "CUDA %d (%s %.1f GiB %04x:%02x:%02x.0)", args->devid, devname, size, props[devid].pciDomainID, props[devid].pciBusID, props[devid].pciDeviceID);
+	else
+#endif
+		snprintf(args->name, sizeof(args->name), "CUDA %d (%s %.1f GiB %02x:%02x.0)", args->devid, devname, size, props[devid].pciBusID, props[devid].pciDeviceID);
 #else
-	snprintf(args->name, sizeof(args->name), "CUDA %d (%s)", args->devid, devname);
+	snprintf(args->name, sizeof(args->name), "CUDA %d (%s %.1f GiB)", args->devid, devname, size);
 #endif
 	snprintf(args->short_name, sizeof(args->short_name), "CUDA %d", args->devid);
 	_STARPU_DEBUG("cuda (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, args->bindid);
 
-	STARPU_TRACE_WORKER_INIT_END
+	_STARPU_TRACE_WORKER_INIT_END
 
 	/* tell the main thread that this one is ready */
-	PTHREAD_MUTEX_LOCK(&args->mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&args->mutex);
 	args->worker_is_initialized = 1;
-	PTHREAD_COND_SIGNAL(&args->ready_cond);
-	PTHREAD_MUTEX_UNLOCK(&args->mutex);
+	_STARPU_PTHREAD_COND_SIGNAL(&args->ready_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&args->mutex);
 
-	struct starpu_job_s * j;
+	struct _starpu_job * j;
 	struct starpu_task *task;
 	int res;
 
@@ -280,60 +306,64 @@ void *_starpu_cuda_worker(void *arg)
 
 	while (_starpu_machine_is_running())
 	{
-		STARPU_TRACE_START_PROGRESS(memnode);
+		_STARPU_TRACE_START_PROGRESS(memnode);
 		_starpu_datawizard_progress(memnode, 1);
-		STARPU_TRACE_END_PROGRESS(memnode);
+		_STARPU_TRACE_END_PROGRESS(memnode);
 
 		task = _starpu_pop_task(args);
 
 		if (!task) 
 		{
-			PTHREAD_MUTEX_LOCK(sched_mutex);
+			_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
 			if (_starpu_worker_can_block(memnode))
 				_starpu_block_worker(workerid, sched_cond, sched_mutex);
 		  
 
-			PTHREAD_MUTEX_UNLOCK(sched_mutex);
+			_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
 
 			continue;
 		};
 
 
 		STARPU_ASSERT(task);
-
 		j = _starpu_get_job_associated_to_task(task);
 
 		/* can CUDA do that task ? */
-		if (!STARPU_CUDA_MAY_PERFORM(j))
+		if (!_STARPU_CUDA_MAY_PERFORM(j))
 		{
 			/* this is neither a cuda nor a cublas task */
-			_starpu_push_task(j, 0);
+			_starpu_push_task(j);
 			continue;
 		}
 
 		_starpu_set_current_task(task);
+		args->current_task = j->task;
 
 		res = execute_job_on_cuda(j, args);
 
-
 		_starpu_set_current_task(NULL);
+		args->current_task = NULL;
 
-		if (res) {
-			switch (res) {
+		if (res)
+		{
+			switch (res)
+			{
 				case -EAGAIN:
 					_STARPU_DISP("ouch, put the codelet %p back ... \n", j);
-					_starpu_push_task(j, 0);
+					_starpu_push_task(j);
 					STARPU_ABORT();
 					continue;
 				default:
-					assert(0);
+					STARPU_ASSERT(0);
 			}
 		}
 
-		_starpu_handle_job_termination(j, 0, workerid);
+		_starpu_handle_job_termination(j, workerid);
 	}
 
-	STARPU_TRACE_WORKER_DEINIT_START
+	_STARPU_TRACE_WORKER_DEINIT_START
+
+	_starpu_handle_all_pending_node_data_requests(memnode);
 
 	/* In case there remains some memory that was automatically
 	 * allocated by StarPU, we release it now. Note that data
@@ -342,7 +372,7 @@ void *_starpu_cuda_worker(void *arg)
 
 	deinit_context(args->workerid, args->devid);
 
-	STARPU_TRACE_WORKER_DEINIT_END(STARPU_FUT_CUDA_KEY);
+	_STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_CUDA_KEY);
 
 	pthread_exit(NULL);
 
@@ -350,10 +380,11 @@ void *_starpu_cuda_worker(void *arg)
 
 }
 
-void starpu_cublas_report_error(const char *func, cublasStatus status)
+void starpu_cublas_report_error(const char *func, const char *file, int line, cublasStatus status)
 {
 	char *errormsg;
-	switch (status) {
+	switch (status)
+	{
 		case CUBLAS_STATUS_SUCCESS:
 			errormsg = "success";
 			break;
@@ -379,13 +410,13 @@ void starpu_cublas_report_error(const char *func, cublasStatus status)
 			errormsg = "unknown error";
 			break;
 	}
-	printf("oops in %s ... %s \n", func, errormsg);
-	assert(0);
+	printf("oops in %s (%s:%u)... %d: %s \n", func, file, line, status, errormsg);
+	STARPU_ASSERT(0);
 }
 
-void starpu_cuda_report_error(const char *func, cudaError_t status)
+void starpu_cuda_report_error(const char *func, const char *file, int line, cudaError_t status)
 {
 	const char *errormsg = cudaGetErrorString(status);
-	printf("oops in %s ... %s \n", func, errormsg);
-	assert(0);
+	printf("oops in %s (%s:%u)... %d: %s \n", func, file, line, status, errormsg);
+	STARPU_ASSERT(0);
 }
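
Besides the worker-loop changes, this file now caches each device's cudaDeviceProp and exposes it through starpu_cuda_get_device_properties(), next to starpu_cuda_get_local_stream() and starpu_cuda_get_global_mem_size(). A hedged sketch of a CUDA codelet using them, not part of the patch (my_kernel is made up, so the launch is only indicated in a comment):

    #include <starpu.h>
    #include <starpu_cuda.h>
    #include <cuda_runtime.h>

    static void scal_cuda(void *buffers[], void *cl_arg)
    {
        (void) buffers;
        (void) cl_arg;

        const struct cudaDeviceProp *prop =
            starpu_cuda_get_device_properties(starpu_worker_get_id());

        /* e.g. pick a block size the device supports */
        unsigned threads = prop->maxThreadsPerBlock < 256 ? prop->maxThreadsPerBlock : 256;
        (void) threads;

        /* my_kernel<<<1, threads, 0, starpu_cuda_get_local_stream()>>>(...); */
        cudaStreamSynchronize(starpu_cuda_get_local_stream());
    }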

+ 35 - 28
src/drivers/driver_common/driver_common.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -24,68 +24,74 @@
 #include <core/debug.h>
 #include <drivers/driver_common/driver_common.h>
 #include <starpu_top.h>
+#include <core/sched_policy.h>
+#include <top/starpu_top_core.h>
 
-void _starpu_driver_start_job(struct starpu_worker_s *args, starpu_job_t j, struct timespec *codelet_start, int rank)
+void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j, struct timespec *codelet_start, int rank)
 {
 	struct starpu_task *task = j->task;
-	struct starpu_codelet_t *cl = task->cl;
+	struct starpu_codelet *cl = task->cl;
 	struct starpu_task_profiling_info *profiling_info;
 	int profiling = starpu_profiling_status_get();
-	int starpu_top=starpu_top_status_get();
+	int starpu_top=_starpu_top_status_get();
 	int workerid = args->workerid;
 	unsigned calibrate_model = 0;
 
 	if (cl->model && cl->model->benchmarking)
 		calibrate_model = 1;
 
+	if (rank == 0)
+		_starpu_sched_pre_exec_hook(task);
+
 	args->status = STATUS_EXECUTING;
-	task->status = STARPU_TASK_RUNNING;	
+	task->status = STARPU_TASK_RUNNING;
 
-	if (rank == 0) {
+	if (rank == 0)
+	{
 		cl->per_worker_stats[workerid]++;
 
 		profiling_info = task->profiling_info;
-	
+
 		if ((profiling && profiling_info) || calibrate_model || starpu_top)
 		{
-			starpu_clock_gettime(codelet_start);
+			_starpu_clock_gettime(codelet_start);
 			_starpu_worker_register_executing_start_date(workerid, codelet_start);
 		}
 	}
 
 	if (starpu_top)
-		starputop_task_started(task,workerid,codelet_start);
+		_starpu_top_task_started(task,workerid,codelet_start);
 
-	STARPU_TRACE_START_CODELET_BODY(j);
+	_STARPU_TRACE_START_CODELET_BODY(j);
 }
 
-void _starpu_driver_end_job(struct starpu_worker_s *args, starpu_job_t j, struct timespec *codelet_end, int rank)
+void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, enum starpu_perf_archtype perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank)
 {
 	struct starpu_task *task = j->task;
-	struct starpu_codelet_t *cl = task->cl;
+	struct starpu_codelet *cl = task->cl;
 	struct starpu_task_profiling_info *profiling_info = task->profiling_info;
 	int profiling = starpu_profiling_status_get();
-	int starpu_top=starpu_top_status_get();
+	int starpu_top=_starpu_top_status_get();
 	int workerid = args->workerid;
 	unsigned calibrate_model = 0;
-	enum starpu_perf_archtype archtype STARPU_ATTRIBUTE_UNUSED = args->perf_arch;
 
-	STARPU_TRACE_END_CODELET_BODY(j, archtype);
+	_STARPU_TRACE_END_CODELET_BODY(j, j->nimpl, perf_arch);
 
 	if (cl->model && cl->model->benchmarking)
 		calibrate_model = 1;
 
-	if (rank == 0) {
+	if (rank == 0)
+	{
 		if ((profiling && profiling_info) || calibrate_model || starpu_top)
-			starpu_clock_gettime(codelet_end);
+			_starpu_clock_gettime(codelet_end);
 	}
 
 	if (starpu_top)
-	  starputop_task_ended(task,workerid,codelet_end);
+	  _starpu_top_task_ended(task,workerid,codelet_end);
 
 	args->status = STATUS_UNKNOWN;
 }
-void _starpu_driver_update_job_feedback(starpu_job_t j, struct starpu_worker_s *worker_args,
+void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_worker *worker_args,
 					enum starpu_perf_archtype perf_arch,
 					struct timespec *codelet_start, struct timespec *codelet_end)
 {
@@ -93,7 +99,7 @@ void _starpu_driver_update_job_feedback(starpu_job_t j, struct starpu_worker_s *
 	struct timespec measured_ts;
 	double measured;
 	int workerid = worker_args->workerid;
-	struct starpu_codelet_t *cl = j->task->cl;
+	struct starpu_codelet *cl = j->task->cl;
 	int calibrate_model = 0;
 	int profiling = starpu_profiling_status_get();
 	int updated = 0;
@@ -112,7 +118,7 @@ void _starpu_driver_update_job_feedback(starpu_job_t j, struct starpu_worker_s *
 			memcpy(&profiling_info->end_time, codelet_end, sizeof(struct timespec));
 
 			profiling_info->workerid = workerid;
-			
+
 			_starpu_worker_update_profiling_info_executing(workerid, &measured_ts, 1,
 				profiling_info->used_cycles,
 				profiling_info->stall_cycles,
@@ -130,9 +136,10 @@ void _starpu_driver_update_job_feedback(starpu_job_t j, struct starpu_worker_s *
 	if (!updated)
 		_starpu_worker_update_profiling_info_executing(workerid, NULL, 1, 0, 0, 0);
 
-	if (profiling_info && profiling_info->power_consumed && cl->power_model && cl->power_model->benchmarking) {
+	if (profiling_info && profiling_info->power_consumed && cl->power_model && cl->power_model->benchmarking)
+	{
 		_starpu_update_perfmodel_history(j, j->task->cl->power_model,  perf_arch, worker_args->devid, profiling_info->power_consumed,j->nimpl);
-		}
+	}
 }
 
 /* Workers may block when there is no work to do at all. We assume that the
@@ -141,17 +148,17 @@ void _starpu_block_worker(int workerid, pthread_cond_t *cond, pthread_mutex_t *m
 {
 	struct timespec start_time, end_time;
 
-	STARPU_TRACE_WORKER_SLEEP_START
+	_STARPU_TRACE_WORKER_SLEEP_START
 	_starpu_worker_set_status(workerid, STATUS_SLEEPING);
 
-	starpu_clock_gettime(&start_time);
+	_starpu_clock_gettime(&start_time);
 	_starpu_worker_register_sleeping_start_date(workerid, &start_time);
 
-	PTHREAD_COND_WAIT(cond, mutex);
+	_STARPU_PTHREAD_COND_WAIT(cond, mutex);
 
 	_starpu_worker_set_status(workerid, STATUS_UNKNOWN);
-	STARPU_TRACE_WORKER_SLEEP_END
-	starpu_clock_gettime(&end_time);
+	_STARPU_TRACE_WORKER_SLEEP_END
+	_starpu_clock_gettime(&end_time);
 
 	int profiling = starpu_profiling_status_get();
 	if (profiling)

+ 8 - 8
src/drivers/driver_common/driver_common.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,13 +23,13 @@
 #include <core/jobs.h>
 #include <common/utils.h>
 
-void _starpu_driver_start_job(struct starpu_worker_s *args, starpu_job_t j,
-		struct timespec *codelet_start, int rank);
-void _starpu_driver_end_job(struct starpu_worker_s *args, starpu_job_t j,
-		struct timespec *codelet_end, int rank);
-void _starpu_driver_update_job_feedback(starpu_job_t j, struct starpu_worker_s *worker_args,
-		enum starpu_perf_archtype perf_arch,
-		struct timespec *codelet_start, struct timespec *codelet_end);
+void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j,
+			      struct timespec *codelet_start, int rank);
+void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, enum starpu_perf_archtype perf_arch,
+			    struct timespec *codelet_end, int rank);
+void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_worker *worker_args,
+					enum starpu_perf_archtype perf_arch,
+					struct timespec *codelet_start, struct timespec *codelet_end);
 
 void _starpu_block_worker(int workerid, pthread_cond_t *cond, pthread_mutex_t *mutex);
 

+ 107 - 90
src/drivers/gordon/driver_gordon.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009, 2010, 2011-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -34,14 +34,15 @@ pthread_t progress_thread;
 pthread_cond_t progress_cond;
 pthread_mutex_t progress_mutex;
 
-struct gordon_task_wrapper_s {
+struct gordon_task_wrapper_s
+{
 	/* who has executed that ? */
-	struct starpu_worker_s *worker;
+	struct _starpu_worker *worker;
 
-	struct starpu_job_list_s *list;	/* StarPU */
+	struct _starpu_job_list *list;	/* StarPU */
 	struct gordon_ppu_job_s *gordon_job; /* gordon*/
 
-	struct starpu_job_s *j; /* if there is a single task */
+	struct _starpu_job *j; /* if there is a single task */
 
 	/* debug */
 	unsigned terminated;
@@ -52,18 +53,19 @@ void *gordon_worker_progress(void *arg)
 	_STARPU_DEBUG("gordon_worker_progress\n");
 
 	/* fix the thread on the correct cpu */
-	struct starpu_worker_set_s *gordon_set_arg = arg;
-	unsigned prog_thread_bind_id = 
+	struct _starpu_worker_set *gordon_set_arg = arg;
+	unsigned prog_thread_bind_id =
 		(gordon_set_arg->workers[0].bindid + 1)%(gordon_set_arg->config->nhwcores);
 	_starpu_bind_thread_on_cpu(gordon_set_arg->config, prog_thread_bind_id);
 
-	PTHREAD_MUTEX_LOCK(&progress_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 	progress_thread_is_inited = 1;
-	PTHREAD_COND_SIGNAL(&progress_cond);
-	PTHREAD_MUTEX_UNLOCK(&progress_mutex);
+	_STARPU_PTHREAD_COND_SIGNAL(&progress_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 
-	while (1) {
-		/* the Gordon runtime needs to make sure that we poll it 
+	while (1)
+	{
+		/* the Gordon runtime needs to make sure that we poll it
 		 * so that we handle jobs that are done */
 
 		/* wait for one task termination */
@@ -78,21 +80,22 @@ void *gordon_worker_progress(void *arg)
 	return NULL;
 }
 
-static void starpu_to_gordon_buffers(starpu_job_t j, struct gordon_ppu_job_s *gordon_job, uint32_t memory_node)
+static void starpu_to_gordon_buffers(struct _starpu_job *j, struct gordon_ppu_job_s *gordon_job, uint32_t memory_node)
 {
 	unsigned buffer;
 	unsigned nin = 0, ninout = 0, nout = 0;
 	unsigned in = 0, inout = 0, out = 0;
 
 	struct starpu_task *task = j->task;
-	struct starpu_codelet_t *cl = task->cl;
+	struct starpu_codelet *cl = task->cl;
 
 	/* if it is non null, the argument buffer is considered
  	 * as the first read-only buffer */
-	if (task->cl_arg) {
+	if (task->cl_arg)
+	{
 		gordon_job->buffers[in] = (uint64_t)task->cl_arg;
 		gordon_job->ss[in].size = (uint32_t)task->cl_arg_size;
-		
+
 		nin++; in++;
 	}
 
@@ -100,10 +103,10 @@ static void starpu_to_gordon_buffers(starpu_job_t j, struct gordon_ppu_job_s *go
 	unsigned nbuffers = cl->nbuffers;
 	for (buffer = 0; buffer < nbuffers; buffer++)
 	{
-		struct starpu_buffer_descr_t *descr;
-		descr = &task->buffers[buffer];
+		enum starpu_access_mode mode = cl->modes[buffer];
 
-		switch (descr->mode) {
+		switch (mode)
+		{
 			case STARPU_R:
 				nin++;
 				break;
@@ -120,10 +123,10 @@ static void starpu_to_gordon_buffers(starpu_job_t j, struct gordon_ppu_job_s *go
 	for (buffer = 0; buffer < nbuffers; buffer++)
 	{
 		unsigned gordon_buffer;
-		struct starpu_buffer_descr_t *descr;
-		descr = &task->buffers[buffer];
+		enum starpu_access_mode mode = cl->modes[buffer];
 
-		switch (descr->mode) {
+		switch (mode)
+		{
 			case STARPU_R:
 				gordon_buffer = in++;
 				break;
@@ -136,7 +139,7 @@ static void starpu_to_gordon_buffers(starpu_job_t j, struct gordon_ppu_job_s *go
 				break;
 		}
 
-		starpu_data_handle handle = task->buffers[buffer].handle;
+		starpu_data_handle_t handle = task->handles[buffer];
 
 		gordon_job->nalloc = 0;
 		gordon_job->nin = nin;
@@ -150,9 +153,9 @@ static void starpu_to_gordon_buffers(starpu_job_t j, struct gordon_ppu_job_s *go
 	}
 }
 
-/* we assume the data are already available so that the data interface fields are 
+/* we assume the data are already available so that the data interface fields are
  * already filled */
-static struct gordon_task_wrapper_s *starpu_to_gordon_job(starpu_job_t j)
+static struct gordon_task_wrapper_s *starpu_to_gordon_job(struct _starpu_job *j)
 {
 	struct gordon_ppu_job_s *gordon_job = gordon_alloc_jobs(1, 0);
 	struct gordon_task_wrapper_s *task_wrapper =
@@ -162,10 +165,7 @@ static struct gordon_task_wrapper_s *starpu_to_gordon_job(starpu_job_t j)
 	task_wrapper->j = j;
 	task_wrapper->terminated = 0;
 
-	if (j->task->clgordon_func != STARPU_MULTIPLE_GORDON_IMPLEMENTATIONS)
-		gordon_job->index = j->task->cl->gordon_func;
-	else
-		gordon_job->index = j->task->cl->gordon_funcs[j->nimpl];
+	gordon_job->index = _starpu_task_get_gordon_nth_implementation(j->task->cl, j->nimpl);
 
 	/* we should not hardcode the memory node ... XXX */
 	unsigned memory_node = 0;
@@ -174,21 +174,21 @@ static struct gordon_task_wrapper_s *starpu_to_gordon_job(starpu_job_t j)
 	return task_wrapper;
 }
 
-static void handle_terminated_job(starpu_job_t j)
+static void handle_terminated_job(struct _starpu_job *j)
 {
-	_starpu_push_task_output(j->task, 0);
+	_starpu_push_task_output(j, 0);
 	_starpu_handle_job_termination(j, 0);
 	starpu_wake_all_blocked_workers();
 }
 
 static void gordon_callback_list_func(void *arg)
 {
-	struct gordon_task_wrapper_s *task_wrapper = arg; 
-	struct starpu_job_list_s *wrapper_list; 
+	struct gordon_task_wrapper_s *task_wrapper = arg;
+	struct _starpu_job_list *wrapper_list;
 
 	/* we don't know who will execute that codelet: so we actually defer the
  	 * execution of the StarPU codelet and the job termination until later */
-	struct starpu_worker_s *worker = task_wrapper->worker;
+	struct _starpu_worker *worker = task_wrapper->worker;
 	STARPU_ASSERT(worker);
 
 	wrapper_list = task_wrapper->list;
@@ -200,12 +200,12 @@ static void gordon_callback_list_func(void *arg)
 	unsigned task_cnt = 0;
 
 	/* XXX 0 was hardcoded */
-	while (!starpu_job_list_empty(wrapper_list))
+	while (!_starpu_job_list_empty(wrapper_list))
 	{
-		starpu_job_t j = starpu_job_list_pop_back(wrapper_list);
+		struct _starpu_job *j = _starpu_job_list_pop_back(wrapper_list);
 
 		struct gordon_ppu_job_s * gordon_task = &task_wrapper->gordon_job[task_cnt];
-		struct starpu_perfmodel_t *model = j->task->cl->model;
+		struct starpu_perfmodel *model = j->task->cl->model;
 		if (model && model->benchmarking)
 		{
 			double measured = (double)gordon_task->measured;
@@ -214,7 +214,7 @@ static void gordon_callback_list_func(void *arg)
 			_starpu_update_perfmodel_history(j, j->task->cl->model, STARPU_GORDON_DEFAULT, cpuid, measured);
 		}
 
-		_starpu_push_task_output(j->task, 0);
+		_starpu_push_task_output(j, 0);
 		_starpu_handle_job_termination(j, 0);
 		//starpu_wake_all_blocked_workers();
 
@@ -222,7 +222,7 @@ static void gordon_callback_list_func(void *arg)
 	}
 
 	/* the job list was allocated by the gordon driver itself */
-	starpu_job_list_delete(wrapper_list);
+	_starpu_job_list_delete(wrapper_list);
 
 	starpu_wake_all_blocked_workers();
 	free(task_wrapper->gordon_job);
@@ -232,11 +232,11 @@ static void gordon_callback_list_func(void *arg)
 
 static void gordon_callback_func(void *arg)
 {
-	struct gordon_task_wrapper_s *task_wrapper = arg; 
+	struct gordon_task_wrapper_s *task_wrapper = arg;
 
 	/* we don't know who will execute that codelet: so we actually defer the
  	 * execution of the StarPU codelet and the job termination until later */
-	struct starpu_worker_s *worker = task_wrapper->worker;
+	struct _starpu_worker *worker = task_wrapper->worker;
 	STARPU_ASSERT(worker);
 
 	task_wrapper->terminated = 1;
@@ -249,17 +249,20 @@ static void gordon_callback_func(void *arg)
 	free(task_wrapper);
 }
 
-int inject_task(starpu_job_t j, struct starpu_worker_s *worker)
+int inject_task(struct _starpu_job *j, struct _starpu_worker *worker)
 {
 	struct starpu_task *task = j->task;
-	int ret = _starpu_fetch_task_input(task, 0);
+	int ret = _starpu_fetch_task_input(j, 0);
 
-	if (ret != 0) {
+	if (ret != 0)
+	{
 		/* there was not enough memory so the codelet cannot be executed right now ... */
 		/* push the codelet back and try another one ... */
 		return STARPU_TRYAGAIN;
 	}
 
+	_starpu_sched_pre_exec_hook(task);
+
 	struct gordon_task_wrapper_s *task_wrapper = starpu_to_gordon_job(j);
 
 	task_wrapper->worker = worker;
@@ -269,31 +272,33 @@ int inject_task(starpu_job_t j, struct starpu_worker_s *worker)
 	return 0;
 }
 
-int inject_task_list(struct starpu_job_list_s *list, struct starpu_worker_s *worker)
+int inject_task_list(struct _starpu_job_list *list, struct _starpu_worker *worker)
 {
 	/* first put back all tasks that can not be performed by Gordon */
 	unsigned nvalids = 0;
 	unsigned ninvalids = 0;
-	starpu_job_t j;
+	struct _starpu_job *j;
 
 	// TODO !
-//	
-//	for (j = starpu_job_list_begin(list); j != starpu_job_list_end(list); j = starpu_job_list_next(j) )
+//
+//	for (j = _starpu_job_list_begin(list); j != _starpu_job_list_end(list); j = _starpu_job_list_next(j) )
 //	{
-//		if (!STARPU_GORDON_MAY_PERFORM(j)) {
+//		if (!_STARPU_GORDON_MAY_PERFORM(j))
+//              {
 //			// XXX TODO
 //			ninvalids++;
 //			assert(0);
 //		}
-//		else {
+//		else
+//              {
 //			nvalids++;
 //		}
 //	}
 
-	nvalids = job_list_size(list);
+	nvalids = _job_list_size(list);
 //	_STARPU_DEBUG("nvalids %d \n", nvalids);
 
-	
+
 
 	struct gordon_task_wrapper_s *task_wrapper = malloc(sizeof(struct gordon_task_wrapper_s));
 	gordon_job_t *gordon_jobs = gordon_alloc_jobs(nvalids, 0);
@@ -303,26 +308,28 @@ int inject_task_list(struct starpu_job_list_s *list, struct starpu_worker_s *wor
 	task_wrapper->j = NULL;
 	task_wrapper->terminated = 0;
 	task_wrapper->worker = worker;
-	
+
 	unsigned index;
-	for (j = starpu_job_list_begin(list), index = 0; j != starpu_job_list_end(list); j = starpu_job_list_next(j), index++)
+	for (j = _starpu_job_list_begin(list), index = 0; j != _starpu_job_list_end(list); j = _starpu_job_list_next(j), index++)
 	{
 		int ret;
 
 		struct starpu_task *task = j->task;
-		ret = _starpu_fetch_task_input(task, 0);
+		ret = _starpu_fetch_task_input(j, 0);
 		STARPU_ASSERT(!ret);
 
-		gordon_jobs[index].index = task->cl->gordon_func;
+		_starpu_sched_pre_exec_hook(task);
+
+		gordon_jobs[index].index = _starpu_task_get_gordon_nth_implementation(task->cl, j->nimpl);
 
-		struct starpu_perfmodel_t *model = j->task->cl->model;
+		struct starpu_perfmodel *model = j->task->cl->model;
 		if (model && model->benchmarking)
 			gordon_jobs[index].flags.sampling = 1;
 
 		/* we should not hardcode the memory node ... XXX */
 		unsigned memory_node = 0;
 		starpu_to_gordon_buffers(j, &gordon_jobs[index], memory_node);
-		
+
 	}
 
 	gordon_pushjob(task_wrapper->gordon_job, gordon_callback_list_func, task_wrapper);
@@ -330,27 +337,30 @@ int inject_task_list(struct starpu_job_list_s *list, struct starpu_worker_s *wor
 	return 0;
 }
 
-void *gordon_worker_inject(struct starpu_worker_set_s *arg)
+void *gordon_worker_inject(struct _starpu_worker_set *arg)
 {
 
-	while(_starpu_machine_is_running()) {
-		if (gordon_busy_enough()) {
+	while(_starpu_machine_is_running())
+	{
+		if (gordon_busy_enough())
+		{
 			/* gordon already has enough work, wait a little TODO */
 			_starpu_wait_on_sched_event();
 		}
-		else {
+		else
+		{
 #ifndef NOCHAIN
 			int ret = 0;
 #ifdef STARPU_DEVEL
 #warning we should look into the local job list here !
 #endif
 
-			struct starpu_job_list_s *list = _starpu_pop_every_task();
+			struct _starpu_job_list *list = _starpu_pop_every_task();
 			/* XXX 0 is hardcoded */
 			if (list)
 			{
 				/* partition lists */
-				unsigned size = job_list_size(list);
+				unsigned size = _starpu_job_list_size(list);
 				unsigned nchunks = (size<2*arg->nworkers)?size:(2*arg->nworkers);
 				//unsigned nchunks = (size<arg->nworkers)?size:(arg->nworkers);
 
@@ -360,20 +370,20 @@ void *gordon_worker_inject(struct starpu_worker_set_s *arg)
 				unsigned chunk;
 				for (chunk = 0; chunk < nchunks; chunk++)
 				{
-					struct starpu_job_list_s *chunk_list;
+					struct _starpu_job_list *chunk_list;
 					if (chunk != (nchunks -1))
 					{
 						/* split the list in 2 parts : list = chunk_list | tail */
-						chunk_list = starpu_job_list_new();
+						chunk_list = _starpu_job_list_new();
 
 						/* find the end */
 						chunk_list->_head = list->_head;
 
-						starpu_job_itor_t it_j = starpu_job_list_begin(list);
+						struct _starpu_job *it_j = _starpu_job_list_begin(list);
 						unsigned ind;
 						for (ind = 0; ind < chunksize; ind++)
 						{
-							it_j = starpu_job_list_next(it_j);
+							it_j = _starpu_job_list_next(it_j);
 						}
 
 						/* it_j should be the first element of the new list (tail) */
@@ -382,7 +392,8 @@ void *gordon_worker_inject(struct starpu_worker_set_s *arg)
 						list->_head = it_j;
 						it_j->_prev = NULL;
 					}
-					else {
+					else
+					{
 						/* this is the last chunk */
 						chunk_list = list;
 					}
@@ -390,26 +401,30 @@ void *gordon_worker_inject(struct starpu_worker_set_s *arg)
 					ret = inject_task_list(chunk_list, &arg->workers[0]);
 				}
 			}
-			else {
+			else
+			{
 				_starpu_wait_on_sched_event();
 			}
 #else
 			/* gordon should accept a little more work */
-			starpu_job_t j;
+			struct _starpu_job *j;
 			j =  _starpu_pop_task();
 	//		_STARPU_DEBUG("pop task %p\n", j);
-			if (j) {
-				if (STARPU_GORDON_MAY_PERFORM(j)) {
+			if (j)
+			{
+				if (_STARPU_GORDON_MAY_PERFORM(j))
+				{
 					/* inject that task */
 					/* XXX we hardcore &arg->workers[0] for now */
 					inject_task(j, &arg->workers[0]);
 				}
-				else {
-					_starpu_push_task(j, 0);
+				else
+				{
+					_starpu_push_task(j);
 				}
 			}
 #endif
-			
+
 		}
 	}
 
@@ -418,12 +433,12 @@ void *gordon_worker_inject(struct starpu_worker_set_s *arg)
 
 void *_starpu_gordon_worker(void *arg)
 {
-	struct starpu_worker_set_s *gordon_set_arg = arg;
+	struct _starpu_worker_set *gordon_set_arg = arg;
 
 	_starpu_bind_thread_on_cpu(gordon_set_arg->config, gordon_set_arg->workers[0].bindid);
 
 	/* TODO set_local_memory_node per SPU */
-	gordon_init(gordon_set_arg->nworkers);	
+	gordon_init(gordon_set_arg->nworkers);
 
 	/* NB: On SPUs, the worker_key is set to NULL since there is no point
 	 * in associating the PPU thread with a specific SPU (worker) while
@@ -434,7 +449,7 @@ void *_starpu_gordon_worker(void *arg)
 	unsigned spu;
 	for (spu = 0; spu < gordon_set_arg->nworkers; spu++)
 	{
-		struct starpu_worker_s *worker = &gordon_set_arg->workers[spu];
+		struct _starpu_worker *worker = &gordon_set_arg->workers[spu];
 		snprintf(worker->name, sizeof(worker->name), "SPU %d", worker->id);
 		snprintf(worker->short_name, sizeof(worker->short_name), "SPU %d", worker->id);
 	}
@@ -446,27 +461,29 @@ void *_starpu_gordon_worker(void *arg)
 	 */
 
 	/* launch the progression thread */
-	PTHREAD_MUTEX_INIT(&progress_mutex, NULL);
-	PTHREAD_COND_INIT(&progress_cond, NULL);
-	
+	_STARPU_PTHREAD_MUTEX_INIT(&progress_mutex, NULL);
+	_STARPU_PTHREAD_COND_INIT(&progress_cond, NULL);
+
 	pthread_create(&progress_thread, NULL, gordon_worker_progress, gordon_set_arg);
 
 	/* wait for the progression thread to be ready */
-	PTHREAD_MUTEX_LOCK(&progress_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 	while (!progress_thread_is_inited)
-		PTHREAD_COND_WAIT(&progress_cond, &progress_mutex);
-	PTHREAD_MUTEX_UNLOCK(&progress_mutex);
+		_STARPU_PTHREAD_COND_WAIT(&progress_cond, &progress_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 
 	_STARPU_DEBUG("progress thread is running ... \n");
-	
+
 	/* tell the core that gordon is ready */
-	PTHREAD_MUTEX_LOCK(&gordon_set_arg->mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&gordon_set_arg->mutex);
 	gordon_set_arg->set_is_initialized = 1;
-	PTHREAD_COND_SIGNAL(&gordon_set_arg->ready_cond);
-	PTHREAD_MUTEX_UNLOCK(&gordon_set_arg->mutex);
+	_STARPU_PTHREAD_COND_SIGNAL(&gordon_set_arg->ready_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&gordon_set_arg->mutex);
 
 	gordon_worker_inject(gordon_set_arg);
 
+	_starpu_handle_all_pending_node_data_requests(memnode);
+
 	_STARPU_DEBUG("gordon deinit...\n");
 	gordon_deinit();
 	_STARPU_DEBUG("gordon was deinited\n");

+ 143 - 97
src/drivers/opencl/driver_opencl.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  INRIA
  *
@@ -80,6 +80,15 @@ static void unlimit_gpu_mem_if_needed(int devid)
 	}
 }
 
+size_t starpu_opencl_get_global_mem_size(int devid)
+{
+	cl_ulong totalGlobalMem;
+
+	/* Request the size of the current device's memory */
+	clGetDeviceInfo(devices[devid], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(totalGlobalMem), &totalGlobalMem, NULL);
+
+	return (size_t)totalGlobalMem;
+}
 
 void starpu_opencl_get_context(int devid, cl_context *context)
 {
@@ -98,14 +107,14 @@ void starpu_opencl_get_queue(int devid, cl_command_queue *queue)
 
 void starpu_opencl_get_current_queue(cl_command_queue *queue)
 {
-	struct starpu_worker_s *worker = _starpu_get_local_worker_key();
+	struct _starpu_worker *worker = _starpu_get_local_worker_key();
 	STARPU_ASSERT(queue);
         *queue = queues[worker->devid];
 }
 
 void starpu_opencl_get_current_context(cl_context *context)
 {
-	struct starpu_worker_s *worker = _starpu_get_local_worker_key();
+	struct _starpu_worker *worker = _starpu_get_local_worker_key();
 	STARPU_ASSERT(context);
         *context = contexts[worker->devid];
 }
@@ -114,7 +123,7 @@ cl_int _starpu_opencl_init_context(int devid)
 {
 	cl_int err;
 
-	PTHREAD_MUTEX_LOCK(&big_lock);
+	_STARPU_PTHREAD_MUTEX_LOCK(&big_lock);
 
         _STARPU_DEBUG("Initialising context for dev %d\n", devid);
 
@@ -134,7 +143,7 @@ cl_int _starpu_opencl_init_context(int devid)
         transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
-	PTHREAD_MUTEX_UNLOCK(&big_lock);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&big_lock);
 
 	limit_gpu_mem_if_needed(devid);
 
@@ -145,7 +154,7 @@ cl_int _starpu_opencl_deinit_context(int devid)
 {
         cl_int err;
 
-	PTHREAD_MUTEX_LOCK(&big_lock);
+	_STARPU_PTHREAD_MUTEX_LOCK(&big_lock);
 
         _STARPU_DEBUG("De-initialising context for dev %d\n", devid);
 
@@ -162,136 +171,168 @@ cl_int _starpu_opencl_deinit_context(int devid)
 
         contexts[devid] = NULL;
 
-	PTHREAD_MUTEX_UNLOCK(&big_lock);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&big_lock);
 
         return CL_SUCCESS;
 }
 
-cl_int _starpu_opencl_allocate_memory(void **addr, size_t size, cl_mem_flags flags)
+cl_int starpu_opencl_allocate_memory(cl_mem *mem, size_t size, cl_mem_flags flags)
 {
 	cl_int err;
-        cl_mem address;
-        struct starpu_worker_s *worker = _starpu_get_local_worker_key();
+        cl_mem memory;
+        struct _starpu_worker *worker = _starpu_get_local_worker_key();
 
-	address = clCreateBuffer(contexts[worker->devid], flags, size, NULL, &err);
+	memory = clCreateBuffer(contexts[worker->devid], flags, size, NULL, &err);
 	if (err == CL_OUT_OF_HOST_MEMORY) return err;
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
-        *addr = address;
+        *mem = memory;
         return CL_SUCCESS;
 }
 
-cl_int _starpu_opencl_copy_ram_to_opencl_async_sync(void *ptr, cl_mem buffer, size_t size, size_t offset, cl_event *event, int *ret)
+cl_int starpu_opencl_copy_ram_to_opencl_async_sync(void *ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, cl_mem buffer, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event, int *ret)
 {
         cl_int err;
-        struct starpu_worker_s *worker = _starpu_get_local_worker_key();
+        struct _starpu_worker *worker = _starpu_get_local_worker_key();
         cl_bool blocking;
 
         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
+
+        if (event)
+                _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
         err = clEnqueueWriteBuffer(transfer_queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
-        if (STARPU_LIKELY(err == CL_SUCCESS)) {
+        if (event)
+                _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+        if (STARPU_LIKELY(err == CL_SUCCESS))
+	{
                 *ret = (event == NULL) ? 0 : -EAGAIN;
                 return CL_SUCCESS;
         }
-        else {
-                if (event != NULL) {
+        else
+	{
+                if (event != NULL)
+		{
                         /* The asynchronous copy has failed, try to copy synchronously */
                         err = clEnqueueWriteBuffer(transfer_queues[worker->devid], buffer, CL_TRUE, offset, size, ptr, 0, NULL, NULL);
                 }
-                if (STARPU_LIKELY(err == CL_SUCCESS)) {
+                if (STARPU_LIKELY(err == CL_SUCCESS))
+		{
                         *ret = 0;
                         return CL_SUCCESS;
                 }
-                else {
+                else
+		{
                         STARPU_OPENCL_REPORT_ERROR(err);
                         return err;
                 }
         }
 }
 
-cl_int _starpu_opencl_copy_ram_to_opencl(void *ptr, cl_mem buffer, size_t size, size_t offset, cl_event *event)
+cl_int starpu_opencl_copy_ram_to_opencl(void *ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, cl_mem buffer, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event)
 {
         cl_int err;
-        struct starpu_worker_s *worker = _starpu_get_local_worker_key();
+        struct _starpu_worker *worker = _starpu_get_local_worker_key();
         cl_bool blocking;
 
         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
+        if (event)
+                _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
         err = clEnqueueWriteBuffer(transfer_queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
+        if (event)
+                _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
         return CL_SUCCESS;
 }
 
-cl_int _starpu_opencl_copy_opencl_to_ram_async_sync(cl_mem buffer, void *ptr, size_t size, size_t offset, cl_event *event, int *ret)
+cl_int starpu_opencl_copy_opencl_to_ram_async_sync(cl_mem buffer, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event, int *ret)
 {
         cl_int err;
-        struct starpu_worker_s *worker = _starpu_get_local_worker_key();
+        struct _starpu_worker *worker = _starpu_get_local_worker_key();
         cl_bool blocking;
 
         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
+        if (event)
+                _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
         err = clEnqueueReadBuffer(transfer_queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
-        if (STARPU_LIKELY(err == CL_SUCCESS)) {
+        if (event)
+                _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+        if (STARPU_LIKELY(err == CL_SUCCESS))
+	{
                 *ret = (event == NULL) ? 0 : -EAGAIN;
                 return CL_SUCCESS;
         }
-        else {
+        else
+	{
                 if (event != NULL)
                         /* The asynchronous copy has failed, try to copy synchronously */
                         err = clEnqueueReadBuffer(transfer_queues[worker->devid], buffer, CL_TRUE, offset, size, ptr, 0, NULL, NULL);
-                if (STARPU_LIKELY(err == CL_SUCCESS)) {
+                if (STARPU_LIKELY(err == CL_SUCCESS))
+		{
                         *ret = 0;
                         return CL_SUCCESS;
                 }
-                else {
+                else
+		{
                         STARPU_OPENCL_REPORT_ERROR(err);
                         return err;
                 }
         }
-
-        return CL_SUCCESS;
 }
 
-cl_int _starpu_opencl_copy_opencl_to_ram(cl_mem buffer, void *ptr, size_t size, size_t offset, cl_event *event)
+cl_int starpu_opencl_copy_opencl_to_ram(cl_mem buffer, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event)
 {
         cl_int err;
-        struct starpu_worker_s *worker = _starpu_get_local_worker_key();
+        struct _starpu_worker *worker = _starpu_get_local_worker_key();
         cl_bool blocking;
 
         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
+        if (event)
+                _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
         err = clEnqueueReadBuffer(transfer_queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
+        if (event)
+                _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
         return CL_SUCCESS;
 }
 
 #if 0
-cl_int _starpu_opencl_copy_rect_opencl_to_ram(cl_mem buffer, void *ptr, const size_t buffer_origin[3], const size_t host_origin[3],
+cl_int _starpu_opencl_copy_rect_opencl_to_ram(cl_mem buffer, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, const size_t buffer_origin[3], const size_t host_origin[3],
                                               const size_t region[3], size_t buffer_row_pitch, size_t buffer_slice_pitch,
                                               size_t host_row_pitch, size_t host_slice_pitch, cl_event *event)
 {
         cl_int err;
-        struct starpu_worker_s *worker = _starpu_get_local_worker_key();
+        struct _starpu_worker *worker = _starpu_get_local_worker_key();
         cl_bool blocking;
 
         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
+        if (event)
+                _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
         err = clEnqueueReadBufferRect(transfer_queues[worker->devid], buffer, blocking, buffer_origin, host_origin, region, buffer_row_pitch,
                                       buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, 0, NULL, event);
+        if (event)
+                _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
         return CL_SUCCESS;
 }
 
-cl_int _starpu_opencl_copy_rect_ram_to_opencl(void *ptr, cl_mem buffer, const size_t buffer_origin[3], const size_t host_origin[3],
+cl_int _starpu_opencl_copy_rect_ram_to_opencl(void *ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, cl_mem buffer, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, const size_t buffer_origin[3], const size_t host_origin[3],
                                               const size_t region[3], size_t buffer_row_pitch, size_t buffer_slice_pitch,
                                               size_t host_row_pitch, size_t host_slice_pitch, cl_event *event)
 {
         cl_int err;
-        struct starpu_worker_s *worker = _starpu_get_local_worker_key();
+        struct _starpu_worker *worker = _starpu_get_local_worker_key();
         cl_bool blocking;
 
         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
+        if (event)
+                _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
         err = clEnqueueWriteBufferRect(transfer_queues[worker->devid], buffer, blocking, buffer_origin, host_origin, region, buffer_row_pitch,
                                        buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, 0, NULL, event);
+        if (event)
+                _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
         return CL_SUCCESS;
@@ -300,9 +341,10 @@ cl_int _starpu_opencl_copy_rect_ram_to_opencl(void *ptr, cl_mem buffer, const si
 
 void _starpu_opencl_init(void)
 {
-	PTHREAD_MUTEX_LOCK(&big_lock);
-        if (!init_done) {
-                cl_platform_id platform_id[STARPU_OPENCL_PLATFORM_MAX];
+	_STARPU_PTHREAD_MUTEX_LOCK(&big_lock);
+        if (!init_done)
+	{
+                cl_platform_id platform_id[_STARPU_OPENCL_PLATFORM_MAX];
                 cl_uint nb_platforms;
                 cl_device_type device_type = CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR;
                 cl_int err;
@@ -311,26 +353,30 @@ void _starpu_opencl_init(void)
                 _STARPU_DEBUG("Initialising OpenCL\n");
 
                 // Get Platforms
-                err = clGetPlatformIDs(STARPU_OPENCL_PLATFORM_MAX, platform_id, &nb_platforms);
+                err = clGetPlatformIDs(_STARPU_OPENCL_PLATFORM_MAX, platform_id, &nb_platforms);
                 if (err != CL_SUCCESS) nb_platforms=0;
                 _STARPU_DEBUG("Platforms detected: %d\n", nb_platforms);
 
                 // Get devices
                 nb_devices = 0;
                 {
-                        for (i=0; i<nb_platforms; i++) {
+                        for (i=0; i<nb_platforms; i++)
+			{
                                 cl_uint num;
 				int platform_valid = 1;
 				char name[1024], vendor[1024];
 
 				err = clGetPlatformInfo(platform_id[i], CL_PLATFORM_NAME, 1024, name, NULL);
-				if (err != CL_SUCCESS) {
+				if (err != CL_SUCCESS)
+				{
 					STARPU_OPENCL_REPORT_ERROR_WITH_MSG("clGetPlatformInfo NAME", err);
 					platform_valid = 0;
 				}
-				else {
+				else
+				{
 					err = clGetPlatformInfo(platform_id[i], CL_PLATFORM_VENDOR, 1024, vendor, NULL);
-					if (err != CL_SUCCESS) {
+					if (err != CL_SUCCESS)
+					{
 						STARPU_OPENCL_REPORT_ERROR_WITH_MSG("clGetPlatformInfo VENDOR", err);
 						platform_valid = 0;
 					}
@@ -341,12 +387,15 @@ void _starpu_opencl_init(void)
 				else
 					_STARPU_DEBUG("Platform invalid\n");
 #endif
-				if (platform_valid) {
+				if (platform_valid)
+				{
 					err = clGetDeviceIDs(platform_id[i], device_type, STARPU_MAXOPENCLDEVS-nb_devices, &devices[nb_devices], &num);
-					if (err == CL_DEVICE_NOT_FOUND) {
+					if (err == CL_DEVICE_NOT_FOUND)
+					{
 						_STARPU_DEBUG("  No devices detected on this platform\n");
 					}
-					else {
+					else
+					{
 						if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 						_STARPU_DEBUG("  %d devices detected\n", num);
 						nb_devices += num;
@@ -358,13 +407,15 @@ void _starpu_opencl_init(void)
                 // Get location of OpenCl kernel source files
                 _starpu_opencl_program_dir = getenv("STARPU_OPENCL_PROGRAM_DIR");
 
-		if (nb_devices > STARPU_MAXOPENCLDEVS) {
-			_STARPU_DISP("# Warning: %d OpenCL devices available. Only %d enabled. Use configure option --enable-maxopencldev=xxx to update the maximum value of supported OpenCL devices?\n", nb_devices, STARPU_MAXOPENCLDEVS);
+		if (nb_devices > STARPU_MAXOPENCLDEVS)
+		{
+			_STARPU_DISP("# Warning: %u OpenCL devices available. Only %d enabled. Use configure option --enable-maxopencldev=xxx to update the maximum value of supported OpenCL devices?\n", nb_devices, STARPU_MAXOPENCLDEVS);
 			nb_devices = STARPU_MAXOPENCLDEVS;
 		}
 
                 // initialise internal structures
-                for(i=0 ; i<nb_devices ; i++) {
+                for(i=0 ; i<nb_devices ; i++)
+		{
                         contexts[i] = NULL;
                         queues[i] = NULL;
                         transfer_queues[i] = NULL;
@@ -372,15 +423,15 @@ void _starpu_opencl_init(void)
 
                 init_done=1;
         }
-	PTHREAD_MUTEX_UNLOCK(&big_lock);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&big_lock);
 }
 
 static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname);
-static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *args);
+static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_worker *args);
 
 void *_starpu_opencl_worker(void *arg)
 {
-	struct starpu_worker_s* args = arg;
+	struct _starpu_worker* args = arg;
 
 	int devid = args->devid;
 	int workerid = args->workerid;
@@ -390,7 +441,7 @@ void *_starpu_opencl_worker(void *arg)
 #endif
 
 	unsigned memnode = args->memory_node;
-	STARPU_TRACE_WORKER_INIT_START(STARPU_FUT_OPENCL_KEY, devid, memnode);
+	_STARPU_TRACE_WORKER_INIT_START(_STARPU_FUT_OPENCL_KEY, devid, memnode);
 
 	_starpu_bind_thread_on_cpu(args->config, args->bindid);
 
@@ -413,77 +464,79 @@ void *_starpu_opencl_worker(void *arg)
 
 	_STARPU_DEBUG("OpenCL (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, args->bindid);
 
-	STARPU_TRACE_WORKER_INIT_END
+	_STARPU_TRACE_WORKER_INIT_END
 
 	/* tell the main thread that this one is ready */
-	PTHREAD_MUTEX_LOCK(&args->mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&args->mutex);
 	args->worker_is_initialized = 1;
-	PTHREAD_COND_SIGNAL(&args->ready_cond);
-	PTHREAD_MUTEX_UNLOCK(&args->mutex);
+	_STARPU_PTHREAD_COND_SIGNAL(&args->ready_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&args->mutex);
 
-	struct starpu_job_s * j;
+	struct _starpu_job * j;
 	struct starpu_task *task;
 	int res;
 
 	pthread_cond_t *sched_cond = &args->sched_cond;
-        pthread_mutex_t *sched_mutex = &args->sched_mutex;
+	pthread_mutex_t *sched_mutex = &args->sched_mutex;
 
 	while (_starpu_machine_is_running())
 	{
-		STARPU_TRACE_START_PROGRESS(memnode);
+		_STARPU_TRACE_START_PROGRESS(memnode);
 		_starpu_datawizard_progress(memnode, 1);
-		STARPU_TRACE_END_PROGRESS(memnode);
+		_STARPU_TRACE_END_PROGRESS(memnode);
 
 		task = _starpu_pop_task(args);
 		
 		if (task == NULL) 
 		{
-			PTHREAD_MUTEX_LOCK(sched_mutex);
+			_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
 			if (_starpu_worker_can_block(memnode))
 				_starpu_block_worker(workerid, sched_cond, sched_mutex);
 
-			PTHREAD_MUTEX_UNLOCK(sched_mutex);
+			_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
 
 			continue;
 		};
 
-		PTHREAD_MUTEX_UNLOCK(sched_mutex);
-
 		STARPU_ASSERT(task);
 		j = _starpu_get_job_associated_to_task(task);
 
 		/* can OpenCL do that task ? */
-		if (!STARPU_OPENCL_MAY_PERFORM(j))
+		if (!_STARPU_OPENCL_MAY_PERFORM(j))
 		{
 			/* this is not a OpenCL task */
-			_starpu_push_task(j, 0);
+			_starpu_push_task(j);
 			continue;
 		}
 
 		_starpu_set_current_task(j->task);
+		args->current_task = j->task;
 
 		res = _starpu_opencl_execute_job(j, args);
 
-
-
 		_starpu_set_current_task(NULL);
+		args->current_task = NULL;
 
-                if (res) {
-			switch (res) {
+                if (res)
+		{
+			switch (res)
+			{
 				case -EAGAIN:
 					_STARPU_DISP("ouch, put the codelet %p back ... \n", j);
-					_starpu_push_task(j, 0);
+					_starpu_push_task(j);
 					STARPU_ABORT();
 					continue;
 				default:
-					assert(0);
+					STARPU_ASSERT(0);
 			}
 		}
 
-		_starpu_handle_job_termination(j, 0, workerid);
+		_starpu_handle_job_termination(j, workerid);
 	}
 
-	STARPU_TRACE_WORKER_DEINIT_START
+	_STARPU_TRACE_WORKER_DEINIT_START
+
+	_starpu_handle_all_pending_node_data_requests(memnode);
 
         _starpu_opencl_deinit_context(devid);
 
@@ -496,7 +549,8 @@ static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname)
 {
 	int err;
 
-        if (!init_done) {
+        if (!init_done)
+	{
                 _starpu_opencl_init();
         }
 
@@ -510,13 +564,14 @@ static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname)
 
 unsigned _starpu_opencl_get_device_count(void)
 {
-        if (!init_done) {
+        if (!init_done)
+	{
                 _starpu_opencl_init();
         }
 	return nb_devices;
 }
 
-static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *args)
+static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_worker *args)
 {
 	int ret;
 	uint32_t mask = 0;
@@ -527,11 +582,12 @@ static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *ar
 	struct timespec codelet_start, codelet_end;
 
 	STARPU_ASSERT(task);
-	struct starpu_codelet_t *cl = task->cl;
+	struct starpu_codelet *cl = task->cl;
 	STARPU_ASSERT(cl);
 
-	ret = _starpu_fetch_task_input(task, mask);
-	if (ret != 0) {
+	ret = _starpu_fetch_task_input(j, mask);
+	if (ret != 0)
+	{
 		/* there was not enough memory, so the input of
 		 * the codelet cannot be fetched ... put the
 		 * codelet back, and try it later */
@@ -540,26 +596,16 @@ static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *ar
 
 	_starpu_driver_start_job(args, j, &codelet_start, 0);
 
-	if (cl->opencl_func != STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS) {
-		cl_func func = cl->opencl_func;
-		STARPU_ASSERT(func);
-		func(task->interfaces, task->cl_arg);
-	}
-	else {
-		if (cl->opencl_funcs[j->nimpl] != NULL) {
-			/* _STARPU_DEBUG("OpenCL driver : running kernel (%d)\n", j->nimpl); */
-			cl_func func = cl->opencl_funcs[j->nimpl];
-			STARPU_ASSERT(func);
-			func(task->interfaces, task->cl_arg);
-		}
-	}
+	starpu_opencl_func_t func = _starpu_task_get_opencl_nth_implementation(cl, j->nimpl);
+	STARPU_ASSERT(func);
+	func(task->interfaces, task->cl_arg);
 
-	_starpu_driver_end_job(args, j, &codelet_end, 0);
+	_starpu_driver_end_job(args, j, args->perf_arch, &codelet_end, 0);
 
 	_starpu_driver_update_job_feedback(j, args, args->perf_arch,
 							&codelet_start, &codelet_end);
 
-	_starpu_push_task_output(task, mask);
+	_starpu_push_task_output(j, mask);
 
 	return EXIT_SUCCESS;
 }
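
As an aside on the API made public above: starpu_opencl_get_global_mem_size(), starpu_opencl_allocate_memory() and starpu_opencl_get_current_queue() are meant to be called from code running on an OpenCL worker, since they rely on the calling worker's device and queues. A minimal sketch of how a codelet body might use them, assuming only the signatures introduced in this patch; the function name, the buffer size and the use of starpu_worker_get_id()/starpu_worker_get_devid() are illustrative:

#include <starpu.h>
#include <starpu_opencl.h>

/* Hypothetical OpenCL codelet body. */
static void opencl_codelet_func(void *buffers[] STARPU_ATTRIBUTE_UNUSED, void *cl_arg STARPU_ATTRIBUTE_UNUSED)
{
	int devid = starpu_worker_get_devid(starpu_worker_get_id());
	cl_command_queue queue;
	cl_mem scratch;

	/* Total global memory of the device driving this worker. */
	size_t total = starpu_opencl_get_global_mem_size(devid);

	/* Allocate a temporary buffer on the current device; the helper
	 * returns CL_OUT_OF_HOST_MEMORY instead of aborting on OOM. */
	if (starpu_opencl_allocate_memory(&scratch, 1024*1024, CL_MEM_READ_WRITE) != CL_SUCCESS)
		return;

	/* Per-worker queue on which kernels are enqueued. */
	starpu_opencl_get_current_queue(&queue);

	(void) total; (void) queue;
	clReleaseMemObject(scratch);
}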

+ 3 - 17
src/drivers/opencl/driver_opencl.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -39,29 +39,15 @@ int _starpu_opencl_deinit_context(int devid);
 extern
 unsigned _starpu_opencl_get_device_count(void);
 
-extern
-cl_int _starpu_opencl_allocate_memory(void **addr, size_t size, cl_mem_flags flags);
-
-extern
-cl_int _starpu_opencl_copy_ram_to_opencl(void *ptr, cl_mem buffer, size_t size, size_t offset, cl_event *event);
-
-extern
-cl_int _starpu_opencl_copy_opencl_to_ram(cl_mem buffer, void *ptr, size_t size, size_t offset, cl_event *event);
-
-extern
-cl_int _starpu_opencl_copy_ram_to_opencl_async_sync(void *ptr, cl_mem buffer, size_t size, size_t offset, cl_event *event, int *ret);
-
-extern
-cl_int _starpu_opencl_copy_opencl_to_ram_async_sync(cl_mem buffer, void *ptr, size_t size, size_t offset, cl_event *event, int *ret);
 
 #if 0
 extern
-cl_int _starpu_opencl_copy_rect_opencl_to_ram(cl_mem buffer, void *ptr, const size_t buffer_origin[3], const size_t host_origin[3],
+cl_int _starpu_opencl_copy_rect_opencl_to_ram(cl_mem buffer, unsigned src_node, void *ptr, unsigned dst_node, const size_t buffer_origin[3], const size_t host_origin[3],
                                               const size_t region[3], size_t buffer_row_pitch, size_t buffer_slice_pitch,
                                               size_t host_row_pitch, size_t host_slice_pitch, cl_event *event);
 
 extern
-cl_int _starpu_opencl_copy_rect_ram_to_opencl(void *ptr, cl_mem buffer, const size_t buffer_origin[3], const size_t host_origin[3],
+cl_int _starpu_opencl_copy_rect_ram_to_opencl(void *ptr, unsigned src_node, cl_mem buffer, unsigned dst_node, const size_t buffer_origin[3], const size_t host_origin[3],
                                               const size_t region[3], size_t buffer_row_pitch, size_t buffer_slice_pitch,
                                               size_t host_row_pitch, size_t host_slice_pitch, cl_event *event);
 #endif
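
The prototypes dropped here are the internal _starpu_opencl_copy_* helpers; as the driver_opencl.c hunks above show, they become public starpu_opencl_copy_* functions that additionally take the source and destination memory nodes so the driver can emit transfer trace events. A hedged sketch of calling the asynchronous write variant, e.g. from a custom data interface; the my_vector layout and the -EIO error policy are purely illustrative, and the call must run on the OpenCL worker driving the destination device since the helper uses the calling worker's transfer queue:

#include <errno.h>
#include <stdint.h>
#include <starpu.h>
#include <starpu_opencl.h>

struct my_vector                /* hypothetical interface layout */
{
	uintptr_t ptr;          /* host pointer */
	cl_mem dev;             /* device buffer */
	size_t size;            /* bytes to transfer */
};

static int push_to_device(struct my_vector *v, unsigned src_node, unsigned dst_node, cl_event *event)
{
	int ret;
	cl_int err = starpu_opencl_copy_ram_to_opencl_async_sync((void *) v->ptr, src_node,
								 v->dev, dst_node,
								 v->size, 0 /* offset */,
								 event, &ret);
	if (err != CL_SUCCESS)
		return -EIO;

	/* ret is -EAGAIN when the copy was merely submitted (event != NULL),
	 * and 0 when it already completed synchronously. */
	return ret;
}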

+ 125 - 44
src/drivers/opencl/driver_opencl_utils.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -39,27 +39,56 @@ char *_starpu_opencl_program_dir;
 #define _STARPU_STRINGIFY(x) _STARPU_STRINGIFY_(x)
 
 static
-int _starpu_opencl_locate_file(const char *source_file_name, char *located_file_name) {
+int _starpu_opencl_locate_file(const char *source_file_name, char *located_file_name, char *located_dir_name)
+{
+	int ret = EXIT_FAILURE;
+
         _STARPU_DEBUG("Trying to locate <%s>\n", source_file_name);
-        if (access(source_file_name, R_OK) == 0) {
+        if (access(source_file_name, R_OK) == 0)
+	{
                 strcpy(located_file_name, source_file_name);
-                return EXIT_SUCCESS;
+		ret = EXIT_SUCCESS;
         }
-        if (_starpu_opencl_program_dir) {
-                sprintf(located_file_name, "%s/%s", _starpu_opencl_program_dir, source_file_name);
-                _STARPU_DEBUG("Trying to locate <%s>\n", located_file_name);
-                if (access(located_file_name, R_OK) == 0) return EXIT_SUCCESS;
-        }
-        sprintf(located_file_name, "%s/%s", _STARPU_STRINGIFY(STARPU_OPENCL_DATADIR), source_file_name);
-        _STARPU_DEBUG("Trying to locate <%s>\n", located_file_name);
-        if (access(located_file_name, R_OK) == 0) return EXIT_SUCCESS;
-        sprintf(located_file_name, "%s/%s", STARPU_SRC_DIR, source_file_name);
-        _STARPU_DEBUG("Trying to locate <%s>\n", located_file_name);
-        if (access(located_file_name, R_OK) == 0) return EXIT_SUCCESS;
-
-        strcpy(located_file_name, "");
-        _STARPU_ERROR("Cannot locate file <%s>\n", source_file_name);
-        return EXIT_FAILURE;
+
+	if (ret == EXIT_FAILURE && _starpu_opencl_program_dir)
+	{
+		sprintf(located_file_name, "%s/%s", _starpu_opencl_program_dir, source_file_name);
+		_STARPU_DEBUG("Trying to locate <%s>\n", located_file_name);
+		if (access(located_file_name, R_OK) == 0) ret = EXIT_SUCCESS;
+	}
+
+	if (ret == EXIT_FAILURE)
+	{
+		sprintf(located_file_name, "%s/%s", _STARPU_STRINGIFY(STARPU_OPENCL_DATADIR), source_file_name);
+		_STARPU_DEBUG("Trying to locate <%s>\n", located_file_name);
+		if (access(located_file_name, R_OK) == 0) ret = EXIT_SUCCESS;
+	}
+
+	if (ret == EXIT_FAILURE)
+	{
+		sprintf(located_file_name, "%s/%s", STARPU_SRC_DIR, source_file_name);
+		_STARPU_DEBUG("Trying to locate <%s>\n", located_file_name);
+		if (access(located_file_name, R_OK) == 0) ret = EXIT_SUCCESS;
+	}
+
+	if (ret == EXIT_FAILURE)
+	{
+		strcpy(located_file_name, "");
+		strcpy(located_dir_name, "");
+		_STARPU_ERROR("Cannot locate file <%s>\n", source_file_name);
+	}
+	else
+	{
+		char *last = strrchr(located_file_name, '/');
+		if (!last) strcpy(located_dir_name, "");
+		else
+		{
+			sprintf(located_dir_name, "%s", located_file_name);
+			located_dir_name[strlen(located_file_name)-strlen(last)+1] = '\0';
+		}
+	}
+
+        return ret;
 }
 
 cl_int starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue, struct starpu_opencl_program *opencl_programs,
@@ -75,7 +104,8 @@ cl_int starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue, str
         starpu_opencl_get_queue(devid, queue);
 
         program = opencl_programs->programs[devid];
-        if (!program) {
+        if (!program)
+	{
                 _STARPU_DISP("Program not available\n");
                 return CL_INVALID_PROGRAM;
         }
@@ -87,7 +117,8 @@ cl_int starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue, str
 	return CL_SUCCESS;
 }
 
-cl_int starpu_opencl_release_kernel(cl_kernel kernel) {
+cl_int starpu_opencl_release_kernel(cl_kernel kernel)
+{
 	cl_int err;
 
 	err = clReleaseKernel(kernel);
@@ -106,14 +137,15 @@ char *_starpu_opencl_load_program_source(const char *filename)
         char        c;
 
         fh = fopen(filename, "r");
-        if (fh == 0)
+        if (!fh)
                 return NULL;
 
         stat(filename, &statbuf);
         source = (char *) malloc(statbuf.st_size + 1);
 
-        for(c=fgetc(fh), x=0 ; c != EOF ; c = fgetc(fh), x++) {
-          source[x] = c;
+        for(c=(char)fgetc(fh), x=0 ; c != EOF ; c =(char)fgetc(fh), x++)
+	{
+		source[x] = c;
         }
         source[x] = '\0';
 
@@ -133,30 +165,34 @@ int starpu_opencl_load_opencl_from_string(const char *opencl_program_source, str
 
         nb_devices = _starpu_opencl_get_device_count();
         // Iterate over each device
-        for(dev = 0; dev < nb_devices; dev ++) {
+        for(dev = 0; dev < nb_devices; dev ++)
+	{
                 cl_device_id device;
                 cl_context   context;
                 cl_program   program;
                 cl_int       err;
 
+                opencl_programs->programs[dev] = NULL;
+
                 starpu_opencl_get_device(dev, &device);
                 starpu_opencl_get_context(dev, &context);
-                if (context == NULL) {
+                if (context == NULL)
+		{
                         _STARPU_DEBUG("[%d] is not a valid OpenCL context\n", dev);
                         continue;
                 }
 
-                opencl_programs->programs[dev] = NULL;
-
-                if (context == NULL) continue;
-
                 // Create the compute program from the source buffer
                 program = clCreateProgramWithSource(context, 1, (const char **) &opencl_program_source, NULL, &err);
-                if (!program || err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+                if (!program || err != CL_SUCCESS) {
+			_STARPU_DISP("Error: Failed to load program source!\n");
+			return EXIT_FAILURE;
+		}
 
                 // Build the program executable
                 err = clBuildProgram(program, 1, &device, build_options, NULL, NULL);
-                if (err != CL_SUCCESS) {
+                if (err != CL_SUCCESS)
+		{
                         size_t len;
                         static char buffer[4096];
 
@@ -178,21 +214,32 @@ int starpu_opencl_load_opencl_from_file(const char *source_file_name, struct sta
 {
 	int nb_devices;
         char located_file_name[1024];
+        char located_dir_name[1024];
+	char new_build_options[1024];
 
 	// Do not try to load and compile the file if there are no devices
-	nb_devices = _starpu_opencl_get_device_count();
+	nb_devices = starpu_opencl_worker_get_count();
 	if (nb_devices == 0) return EXIT_SUCCESS;
 
         // Locate source file
-        _starpu_opencl_locate_file(source_file_name, located_file_name);
+        _starpu_opencl_locate_file(source_file_name, located_file_name, located_dir_name);
         _STARPU_DEBUG("Source file name : <%s>\n", located_file_name);
+        _STARPU_DEBUG("Source directory name : <%s>\n", located_dir_name);
 
         // Load the compute program from disk into a cstring buffer
         char *opencl_program_source = _starpu_opencl_load_program_source(located_file_name);
         if(!opencl_program_source)
                 _STARPU_ERROR("Failed to load compute program from file <%s>!\n", located_file_name);
 
-        return starpu_opencl_load_opencl_from_string(opencl_program_source, opencl_programs, build_options);
+	if (!strcmp(located_dir_name, ""))
+		strcpy(new_build_options, build_options);
+	else if (build_options)
+		sprintf(new_build_options, "-I %s %s", located_dir_name, build_options);
+	else
+		sprintf(new_build_options, "-I %s", located_dir_name);
+	_STARPU_DEBUG("Build options: <%s>\n", new_build_options);
+
+        return starpu_opencl_load_opencl_from_string(opencl_program_source, opencl_programs, new_build_options);
 }
 
 cl_int starpu_opencl_unload_opencl(struct starpu_opencl_program *opencl_programs)
@@ -200,9 +247,13 @@ cl_int starpu_opencl_unload_opencl(struct starpu_opencl_program *opencl_programs
         unsigned int dev;
         unsigned int nb_devices;
 
+	if (!starpu_opencl_worker_get_count())
+		return CL_SUCCESS;
+
         nb_devices = _starpu_opencl_get_device_count();
         // Iterate over each device
-        for(dev = 0; dev < nb_devices; dev ++) {
+        for(dev = 0; dev < nb_devices; dev ++)
+	{
                 if (opencl_programs->programs[dev])
                         clReleaseProgram(opencl_programs->programs[dev]);
         }
@@ -212,12 +263,13 @@ cl_int starpu_opencl_unload_opencl(struct starpu_opencl_program *opencl_programs
 int starpu_opencl_collect_stats(cl_event event STARPU_ATTRIBUTE_UNUSED)
 {
 #if defined(CL_PROFILING_CLOCK_CYCLE_COUNT)||defined(CL_PROFILING_STALL_CYCLE_COUNT)||defined(CL_PROFILING_POWER_CONSUMED)
-	struct starpu_task *task = starpu_get_current_task();
+	struct starpu_task *task = starpu_task_get_current();
 	struct starpu_task_profiling_info *info = task->profiling_info;
 #endif
 
 #ifdef CL_PROFILING_CLOCK_CYCLE_COUNT
-	if (starpu_profiling_status_get() && info) {
+	if (starpu_profiling_status_get() && info)
+	{
 		cl_int err;
 		unsigned int clock_cycle_count;
 		size_t size;
@@ -228,7 +280,8 @@ int starpu_opencl_collect_stats(cl_event event STARPU_ATTRIBUTE_UNUSED)
 	}
 #endif
 #ifdef CL_PROFILING_STALL_CYCLE_COUNT
-	if (starpu_profiling_status_get() && info) {
+	if (starpu_profiling_status_get() && info)
+	{
 		cl_int err;
 		unsigned int stall_cycle_count;
 		size_t size;
@@ -240,7 +293,8 @@ int starpu_opencl_collect_stats(cl_event event STARPU_ATTRIBUTE_UNUSED)
 	}
 #endif
 #ifdef CL_PROFILING_POWER_CONSUMED
-	if (info && (starpu_profiling_status_get() || (task->cl && task->cl->power_model && task->cl->power_model->benchmarking))) {
+	if (info && (starpu_profiling_status_get() || (task->cl && task->cl->power_model && task->cl->power_model->benchmarking)))
+	{
 		cl_int err;
 		double power_consumed;
 		size_t size;
@@ -255,10 +309,11 @@ int starpu_opencl_collect_stats(cl_event event STARPU_ATTRIBUTE_UNUSED)
 	return 0;
 }
 
-void starpu_opencl_display_error(const char *func, const char* msg, cl_int status)
+void starpu_opencl_display_error(const char *func, const char *file, int line, const char* msg, cl_int status)
 {
 	const char *errormsg;
-	switch (status) {
+	switch (status)
+	{
 	case CL_SUCCESS:
 		errormsg = "success";
 		break;
@@ -407,8 +462,34 @@ void starpu_opencl_display_error(const char *func, const char* msg, cl_int statu
 		break;
 	}
 	if (msg)
-		printf("oops in %s (%s) ... <%s> (%d) \n", func, msg, errormsg, status);
+		printf("oops in %s (%s:%d) (%s) ... <%s> (%d) \n", func, file, line, msg, errormsg, status);
 	else
-		printf("oops in %s ... <%s> (%d) \n", func, errormsg, status);
+		printf("oops in %s (%s:%d) ... <%s> (%d) \n", func, file, line, errormsg, status);
+
+}
+
+int starpu_opencl_set_kernel_args(cl_int *error, cl_kernel *kernel, ...)
+{
+	int i;
+	va_list ap;
+
+	va_start(ap, kernel);
+
+	for (i = 0; ; i++)
+	{
+		int size = va_arg(ap, int);
+		if (size == 0)
+			break;
+
+		cl_mem *ptr = va_arg(ap, cl_mem *);
+		int err = clSetKernelArg(*kernel, i, size, ptr);
+		if (err != CL_SUCCESS)
+		{
+			*error = err;
+			break;
+		}
+	}
 
+	va_end(ap);
+	return i;
 }
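
Taken together, the changes to this file give the following usage pattern: starpu_opencl_load_opencl_from_file() now adds the directory containing the located .cl file to the build options (-I), so kernels may #include headers sitting next to their source, and starpu_opencl_set_kernel_args() sets kernel arguments from (size, pointer) pairs terminated by a size of 0, returning the number of arguments set. A minimal sketch, assuming the usual kernel-name/device-id trailing arguments of starpu_opencl_load_kernel() (its prototype is truncated in the hunk above); "my_kernel.cl", "my_kernel" and the build options are placeholders:

#include <stdlib.h>
#include <starpu.h>
#include <starpu_opencl.h>

static struct starpu_opencl_program programs;

void load_programs(void)
{
	int ret = starpu_opencl_load_opencl_from_file("my_kernel.cl", &programs, "-DBLOCK=16");
	STARPU_ASSERT(ret == EXIT_SUCCESS);
}

/* Typically called from the codelet running on the OpenCL worker for devid. */
void run_kernel(cl_mem data, unsigned n, int devid)
{
	cl_kernel kernel;
	cl_command_queue queue;
	cl_int err;

	/* Trailing arguments (kernel name, device id) assumed from the usual StarPU API. */
	err = starpu_opencl_load_kernel(&kernel, &queue, &programs, "my_kernel", devid);
	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

	/* (size, pointer) pairs, terminated by a size of 0; the return value
	 * is the number of arguments that were actually set. */
	int nargs = starpu_opencl_set_kernel_args(&err, &kernel,
						  sizeof(data), &data,
						  sizeof(n), &n,
						  0);
	if (nargs != 2) STARPU_OPENCL_REPORT_ERROR(err);

	size_t global = n;
	err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

	clFinish(queue);
	starpu_opencl_release_kernel(kernel);
}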

+ 2 - 2
src/drivers/opencl/driver_opencl_utils.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,6 +19,6 @@
 
 #include <config.h>
 
-#define STARPU_OPENCL_PLATFORM_MAX 4
+#define _STARPU_OPENCL_PLATFORM_MAX 4
 
 #endif /* __STARPU_OPENCL_UTILS_H__ */