10 years ago · 8a1120b2d1
--- a/Makefile.am
+++ b/Makefile.am
@@ -85,7 +85,8 @@ versinclude_HEADERS = 				\
 
																 	include/starpu_stdlib.h			\
															
 
																 	include/starpu_thread.h			\
															
 
																 	include/starpu_thread_util.h		\
															
 
																-	include/starpu_tree.h
															
 
																+	include/starpu_tree.h			\
															
 
																+	include/starpu_simgrid_wrap.h
															
 
																 nodist_versinclude_HEADERS = 			\
															
 
																 	include/starpu_config.h
															
--- a/configure.ac
+++ b/configure.ac
@@ -1820,6 +1820,8 @@ AC_SUBST(USE_MPI, $use_mpi)
 
																 AM_CONDITIONAL(USE_MPI, test x$use_mpi = xyes)
															
 
																 if test x$use_mpi = xyes; then
															
 
																 	AC_DEFINE(STARPU_USE_MPI,[],[whether the StarPU MPI library is available])
															
 
																+else
															
 
																+	running_mpi_check=no
															
 
																 fi
															
 
																 AC_ARG_ENABLE(mpi-progression-hook, [AS_HELP_STRING([--enable-mpi-progression-hook],
															
--- a/doc/doxygen/chapters/13offline_performance_tools.doxy
+++ b/doc/doxygen/chapters/13offline_performance_tools.doxy
@@ -119,8 +119,9 @@ $ vite paje.trace
 
																 To get names of tasks instead of "unknown", fill the optional
															
 
																 starpu_codelet::name, or use a performance model for them.
															
 
																 Details of the codelet execution can be obtained by passing
															
 
																-<c>--enable-paje-codelet-details</c> and using a recent enough version of ViTE
															
 
																-(at least r1430).
															
 
																+\ref enable-paje-codelet-details "--enable-paje-codelet-details" when
															
 
																+configuring StarPU and using a recent enough version of ViTE (at least
															
 
																+r1430).
															
 
																 In the MPI execution case, collect the trace files from the MPI nodes, and
															
 
																 specify them all on the command <c>starpu_fxt_tool</c>, for instance:
															
@@ -133,9 +134,10 @@ By default, all tasks are displayed using a green color. To display tasks with
 
																 varying colors, pass option <c>-c</c> to <c>starpu_fxt_tool</c>.
															
 
																 To identify tasks precisely, the application can set the ::tag_id field of the
															
 
																-tasks (or use STARPU_TAG_ONY when using starpu_task_insert), and with a recent
															
 
																-enough version of vite (>= r1430) and the <c>--enable-paje-codelet-details</c>
															
 
																-configure option, the value of the tag will show up in the trace.
															
 
																+tasks (or use STARPU_TAG_ONLY when using starpu_task_insert()), and with a recent
															
 
																+enough version of vite (>= r1430) and the
															
 
																+\ref enable-paje-codelet-details "--enable-paje-codelet-details"
															
 
																+StarPU configure option, the value of the tag will show up in the trace.
															
 
																 Traces can also be inspected by hand by using the tool <c>fxt_print</c>, for instance:
															
--- a/doc/doxygen/chapters/21simgrid.doxy
+++ b/doc/doxygen/chapters/21simgrid.doxy
@@ -22,9 +22,9 @@ get the simulated time, it has to use starpu_timing_now() which returns the
 
																 virtual timestamp in us.
															
 
																 For some technical reason, the application's .c file which contains main() has
															
 
																-to be recompiled with starpu.h, which in the simgrid case will # define main()
															
 
																+to be recompiled with starpu_simgrid_wrap.h, which in the simgrid case will # define main()
															
 
																 into starpu_main(), and it is libstarpu which will provide the real main() and
															
 
																-call the application's main().
															
 
																+will call the application's main().
															
 
																 To be able to test with crazy data sizes, one may want to only allocate
															
 
																 application data if STARPU_SIMGRID is not defined.  Passing a NULL pointer to
															
--- a/doc/doxygen/chapters/api/data_interfaces.doxy
+++ b/doc/doxygen/chapters/api/data_interfaces.doxy
@@ -57,8 +57,9 @@ case of e.g. available particular CUDA or OpenCL support.
 
																 \ingroup API_Data_Interfaces
															
 
																 \var starpu_data_copy_methods::can_copy
															
 
																 If defined, allows the interface to declare whether it supports transferring
															
 
																-from \p src_interface on node \p src_node to \p dst_interface on node \p. If not
															
 
																-defined, it is assumed that the interface supports all transfers.
															
 
																+from \p src_interface on node \p src_node to \p dst_interface on node \p
															
 
																+dst_node, run from node \p handling_node. If not defined, it is assumed that the
															
 
																+interface supports all transfers.
															
 
																 \var starpu_data_copy_methods::ram_to_ram
															
 
																 Define how to copy data from the \p src_interface interface on the \p
															
 
																 src_node CPU node to the \p dst_interface interface on the \p dst_node
															
@@ -1000,11 +1001,12 @@ DefiningANewDataInterface.
 
																 \ingroup API_Data_Interfaces
															
 
																 Allocate \p size bytes on node \p dst_node. This returns 0 if
															
 
																 allocation failed, the allocation method should then return <c>-ENOMEM</c> as
															
 
																-allocated size.
															
 
																+allocated size. Deallocation must be done with starpu_free_on_node().
															
 
																 \fn void starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
															
 
																 \ingroup API_Data_Interfaces
															
 
																-Free \p addr of \p size bytes on node \p dst_node.
															
 
																+Free \p addr of \p size bytes on node \p dst_node which was previously allocated
															
 
																+with starpu_malloc_on_node().
															
 
																 \fn int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, void *async_data)
															
 
																 \ingroup API_Data_Interfaces
															
--- a/examples/basic_examples/vector_scal_c.c
+++ b/examples/basic_examples/vector_scal_c.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																- * Copyright (C) 2011, 2013  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2011, 2013-2014  Université de Bordeaux 1
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -47,6 +47,7 @@ static struct starpu_codelet cl =
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	/* CUDA implementation of the codelet */
															
 
																 	.cuda_funcs = {scal_cuda_func, NULL},
															
 
																+	.cuda_flags = {STARPU_CUDA_ASYNC},
															
 
																 #endif
															
 
																 	.nbuffers = 1,
															
 
																 	.model = &vector_scal_model
															
--- a/examples/filters/custom_mf/custom_interface.c
+++ b/examples/filters/custom_mf/custom_interface.c
@@ -425,7 +425,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
 
																 	size = src_custom->nx * 2 * sizeof(float);
															
 
																 	if (dst_custom->cpu_ptr == NULL)
															
 
																 	{
															
 
																-		ret = starpu_opencl_allocate_memory((cl_mem*)&dst_custom->cpu_ptr,
															
 
																+		ret = starpu_opencl_allocate_memory(devid, (cl_mem*)&dst_custom->cpu_ptr,
															
 
																 				size, CL_MEM_READ_WRITE);
															
 
																 		assert(ret == CL_SUCCESS);
															
 
																 	}
															
--- a/examples/lu/lu_example.c
+++ b/examples/lu/lu_example.c
@@ -58,6 +58,7 @@ static void parse_args(int argc, char **argv)
 
																 			nblocks = strtol(argv[++i], &argptr, 10);
															
 
																 		}
															
 
																+#ifndef STARPU_SIMGRID
															
 
																 		else if (strcmp(argv[i], "-check") == 0)
															
 
																 		{
															
 
																 			check = 1;
															
@@ -72,6 +73,7 @@ static void parse_args(int argc, char **argv)
 
																 		{
															
 
																 			no_stride = 1;
															
 
																 		}
															
 
																+#endif
															
 
																 		else if (strcmp(argv[i], "-profile") == 0)
															
 
																 		{
															
@@ -315,6 +317,7 @@ int main(int argc, char **argv)
 
																 	starpu_cublas_init();
															
 
																+#ifndef STARPU_SIMGRID
															
 
																 	init_matrix();
															
 
																 	unsigned *ipiv = NULL;
															
@@ -364,6 +367,7 @@ int main(int argc, char **argv)
 
																 		}
															
 
																 	}
															
 
																 	else
															
 
																+#endif
															
 
																 	{
															
 
																 		ret = STARPU_LU(lu_decomposition)(A, size, size, nblocks);
															
 
																 	}
															
@@ -403,6 +407,7 @@ int main(int argc, char **argv)
 
																 		}
															
 
																 	}
															
 
																+#ifndef STARPU_SIMGRID
															
 
																 	if (check)
															
 
																 	{
															
 
																 		FPRINTF(stderr, "Checking result\n");
															
@@ -413,6 +418,7 @@ int main(int argc, char **argv)
 
																 		check_result();
															
 
																 	}
															
 
																+#endif
															
 
																 	starpu_free(A);
															
--- a/include/starpu.h
+++ b/include/starpu.h
@@ -66,18 +66,13 @@ typedef UINT_PTR uintptr_t;
 
																 #include <starpu_fxt.h>
															
 
																 #include <starpu_driver.h>
															
 
																 #include <starpu_tree.h>
															
 
																+#include <starpu_simgrid_wrap.h>
															
 
																 #ifdef __cplusplus
															
 
																 extern "C"
															
 
																 {
															
 
																 #endif
															
 
																-#ifdef STARPU_SIMGRID
															
 
																-#ifndef main
															
 
																-#define main starpu_main
															
 
																-#endif
															
 
																-#endif
															
 
																-
															
 
																 struct starpu_conf
															
 
																 {
															
 
																 	int magic;
															
--- a/include/starpu_data_interfaces.h
+++ b/include/starpu_data_interfaces.h
@@ -37,7 +37,7 @@ extern "C"
 
																 struct starpu_data_copy_methods
															
 
																 {
															
 
																-	int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
															
 
																+	int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node);
															
 
																 	int (*ram_to_ram)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
															
 
																 	int (*ram_to_cuda)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
															
--- a/include/starpu_opencl.h
+++ b/include/starpu_opencl.h
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010-2013  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010-2014  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -76,7 +76,7 @@ int starpu_opencl_collect_stats(cl_event event);
 
																 int starpu_opencl_set_kernel_args(cl_int *err, cl_kernel *kernel, ...);
															
 
																-cl_int starpu_opencl_allocate_memory(cl_mem *addr, size_t size, cl_mem_flags flags);
															
 
																+cl_int starpu_opencl_allocate_memory(int devid, cl_mem *addr, size_t size, cl_mem_flags flags);
															
 
																 cl_int starpu_opencl_copy_ram_to_opencl(void *ptr, unsigned src_node, cl_mem buffer, unsigned dst_node, size_t size, size_t offset, cl_event *event, int *ret);
															
--- a/include/starpu_simgrid_wrap.h
+++ b/include/starpu_simgrid_wrap.h
@@ -0,0 +1,28 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2014  Université de Bordeaux 1
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#ifndef __STARPU_SIMGRID_WRAP_H__
															
 
																+#define __STARPU_SIMGRID_WRAP_H__
															
 
																+
															
 
																+#include <starpu_config.h>
															
 
																+
															
 
																+#ifdef STARPU_SIMGRID
															
 
																+#ifndef main
															
 
																+#define main starpu_main
															
 
																+#endif
															
 
																+#endif
															
 
																+
															
 
																+#endif /* __STARPU_SIMGRID_WRAP_H__ */
															
--- a/mpi/src/starpu_mpi.c
+++ b/mpi/src/starpu_mpi.c
@@ -173,12 +173,12 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
 
																 	 _starpu_mpi_comm_amounts_inc(req->comm, req->srcdst, req->datatype, req->count);
															
 
																-	 TRACE_MPI_ISEND_SUBMIT_BEGIN(req->srcdst, req->mpi_tag, 0);
															
 
																+	 _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(req->srcdst, req->mpi_tag, 0);
															
 
																 	 req->ret = MPI_Isend(req->ptr, req->count, req->datatype, req->srcdst, _starpu_mpi_tag, req->comm, &req->request);
															
 
																 	 STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Isend returning %d", req->ret);
															
 
																-	 TRACE_MPI_ISEND_SUBMIT_END(req->srcdst, req->mpi_tag, 0);
															
 
																+	 _STARPU_MPI_TRACE_ISEND_SUBMIT_END(req->srcdst, req->mpi_tag, 0);
															
 
																 	 /* somebody is perhaps waiting for the MPI request to be posted */
															
 
																 	 STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
															
@@ -304,12 +304,12 @@ static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 
																 	_STARPU_MPI_DEBUG(20, "post MPI irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
															
 
																-	TRACE_MPI_IRECV_SUBMIT_BEGIN(req->srcdst, req->mpi_tag);
															
 
																+	_STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(req->srcdst, req->mpi_tag);
															
 
																 	req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->srcdst, _starpu_mpi_tag, req->comm, &req->request);
															
 
																 	STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_IRecv returning %d", req->ret);
															
 
																-	TRACE_MPI_IRECV_SUBMIT_END(req->srcdst, req->mpi_tag);
															
 
																+	_STARPU_MPI_TRACE_IRECV_SUBMIT_END(req->srcdst, req->mpi_tag);
															
 
																 	/* somebody is perhaps waiting for the MPI request to be posted */
															
 
																 	STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
															
@@ -413,12 +413,12 @@ static void _starpu_mpi_wait_func(struct _starpu_mpi_req *waiting_req)
 
																 	/* Which is the mpi request we are waiting for ? */
															
 
																 	struct _starpu_mpi_req *req = waiting_req->other_request;
															
 
																-	TRACE_MPI_UWAIT_BEGIN(req->srcdst, req->mpi_tag);
															
 
																+	_STARPU_MPI_TRACE_UWAIT_BEGIN(req->srcdst, req->mpi_tag);
															
 
																 	req->ret = MPI_Wait(&req->request, waiting_req->status);
															
 
																 	STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Wait returning %d", req->ret);
															
 
																-	TRACE_MPI_UWAIT_END(req->srcdst, req->mpi_tag);
															
 
																+	_STARPU_MPI_TRACE_UWAIT_END(req->srcdst, req->mpi_tag);
															
 
																 	_starpu_mpi_handle_request_termination(req);
															
 
																 	_STARPU_MPI_LOG_OUT();
															
@@ -481,12 +481,12 @@ static void _starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
 
																 	_STARPU_MPI_DEBUG(2, "Test request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n",
															
 
																 			  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
															
 
																-	TRACE_MPI_UTESTING_BEGIN(req->srcdst, req->mpi_tag);
															
 
																+	_STARPU_MPI_TRACE_UTESTING_BEGIN(req->srcdst, req->mpi_tag);
															
 
																 	req->ret = MPI_Test(&req->request, testing_req->flag, testing_req->status);
															
 
																 	STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Test returning %d", req->ret);
															
 
																-	TRACE_MPI_UTESTING_END(req->srcdst, req->mpi_tag);
															
 
																+	_STARPU_MPI_TRACE_UTESTING_END(req->srcdst, req->mpi_tag);
															
 
																 	if (*testing_req->flag)
															
 
																 	{
															
@@ -933,22 +933,22 @@ static void _starpu_mpi_test_detached_requests(void)
 
																 		{
															
 
																 			if (req->request_type == RECV_REQ)
															
 
																 			{
															
 
																-				TRACE_MPI_IRECV_COMPLETE_BEGIN(req->srcdst, req->mpi_tag);
															
 
																+				_STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(req->srcdst, req->mpi_tag);
															
 
																 			}
															
 
																 			else if (req->request_type == SEND_REQ)
															
 
																 			{
															
 
																-				TRACE_MPI_ISEND_COMPLETE_BEGIN(req->srcdst, req->mpi_tag, 0);
															
 
																+				_STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(req->srcdst, req->mpi_tag, 0);
															
 
																 			}
															
 
																 			_starpu_mpi_handle_request_termination(req);
															
 
																 			if (req->request_type == RECV_REQ)
															
 
																 			{
															
 
																-				TRACE_MPI_IRECV_COMPLETE_END(req->srcdst, req->mpi_tag);
															
 
																+				_STARPU_MPI_TRACE_IRECV_COMPLETE_END(req->srcdst, req->mpi_tag);
															
 
																 			}
															
 
																 			else if (req->request_type == SEND_REQ)
															
 
																 			{
															
 
																-				TRACE_MPI_ISEND_COMPLETE_END(req->srcdst, req->mpi_tag, 0);
															
 
																+				_STARPU_MPI_TRACE_ISEND_COMPLETE_END(req->srcdst, req->mpi_tag, 0);
															
 
																 			}
															
 
																 		}
															
@@ -1057,7 +1057,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
																 	MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
															
 
																 	{
															
 
																-		TRACE_MPI_START(rank, worldsize);
															
 
																+		_STARPU_MPI_TRACE_START(rank, worldsize);
															
 
																 #ifdef STARPU_USE_FXT
															
 
																 		starpu_profiling_set_id(rank);
															
 
																 #endif //STARPU_USE_FXT
															
@@ -1097,14 +1097,14 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
																 		{
															
 
																 			_STARPU_MPI_DEBUG(3, "NO MORE REQUESTS TO HANDLE\n");
															
 
																-			TRACE_MPI_SLEEP_BEGIN();
															
 
																+			_STARPU_MPI_TRACE_SLEEP_BEGIN();
															
 
																 			if (barrier_running)
															
 
																 				/* Tell mpi_barrier */
															
 
																 				STARPU_PTHREAD_COND_SIGNAL(&cond_finished);
															
 
																 			STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
															
 
																-			TRACE_MPI_SLEEP_END();
															
 
																+			_STARPU_MPI_TRACE_SLEEP_END();
															
 
																 		}
															
 
																 		/* get one request */
															
@@ -1323,7 +1323,7 @@ static void _starpu_mpi_add_sync_point_in_fxt(void)
 
																 	ret = MPI_Bcast(&random_number, 1, MPI_INT, 0, MPI_COMM_WORLD);
															
 
																 	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Bcast returning %d", ret);
															
 
																-	TRACE_MPI_BARRIER(rank, worldsize, random_number);
															
 
																+	_STARPU_MPI_TRACE_BARRIER(rank, worldsize, random_number);
															
 
																 	_STARPU_MPI_DEBUG(3, "unique key %x\n", random_number);
															
 
																 #endif
															
@@ -1407,7 +1407,7 @@ int starpu_mpi_shutdown(void)
 
																 	starpu_progression_hook_deregister(hookid);
															
 
																 #endif /* STARPU_MPI_ACTIVITY */
															
 
																-	TRACE_MPI_STOP(rank, world_size);
															
 
																+	_STARPU_MPI_TRACE_STOP(rank, world_size);
															
 
																 	/* free the request queues */
															
 
																 	_starpu_mpi_req_list_delete(detached_requests);
															
--- a/mpi/src/starpu_mpi_fxt.h
+++ b/mpi/src/starpu_mpi_fxt.h
@@ -26,86 +26,86 @@
 
																 extern "C" {
															
 
																 #endif
															
 
																-#define FUT_MPI_START				0x5201
															
 
																-#define FUT_MPI_STOP				0x5202
															
 
																-#define FUT_MPI_BARRIER				0x5203
															
 
																-#define FUT_MPI_ISEND_SUBMIT_BEGIN		0x5204
															
 
																-#define FUT_MPI_ISEND_SUBMIT_END		0x5205
															
 
																-#define FUT_MPI_IRECV_SUBMIT_BEGIN		0x5206
															
 
																-#define FUT_MPI_IRECV_SUBMIT_END		0x5207
															
 
																-#define FUT_MPI_ISEND_COMPLETE_BEGIN		0x5208
															
 
																-#define FUT_MPI_ISEND_COMPLETE_END		0x5209
															
 
																-#define FUT_MPI_IRECV_COMPLETE_BEGIN		0x5210
															
 
																-#define FUT_MPI_IRECV_COMPLETE_END		0x5211
															
 
																-#define FUT_MPI_SLEEP_BEGIN			0x5212
															
 
																-#define FUT_MPI_SLEEP_END			0x5213
															
 
																-#define FUT_MPI_DTESTING_BEGIN			0x5214
															
 
																-#define FUT_MPI_DTESTING_END			0x5215
															
 
																-#define FUT_MPI_UTESTING_BEGIN			0x5216
															
 
																-#define FUT_MPI_UTESTING_END			0x5217
															
 
																-#define FUT_MPI_UWAIT_BEGIN			0x5218
															
 
																-#define FUT_MPI_UWAIT_END			0x5219
															
 
																+#define _STARPU_MPI_FUT_START				0x5201
															
 
																+#define _STARPU_MPI_FUT_STOP				0x5202
															
 
																+#define _STARPU_MPI_FUT_BARRIER				0x5203
															
 
																+#define _STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN		0x5204
															
 
																+#define _STARPU_MPI_FUT_ISEND_SUBMIT_END		0x5205
															
 
																+#define _STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN		0x5206
															
 
																+#define _STARPU_MPI_FUT_IRECV_SUBMIT_END		0x5207
															
 
																+#define _STARPU_MPI_FUT_ISEND_COMPLETE_BEGIN		0x5208
															
 
																+#define _STARPU_MPI_FUT_ISEND_COMPLETE_END		0x5209
															
 
																+#define _STARPU_MPI_FUT_IRECV_COMPLETE_BEGIN		0x5210
															
 
																+#define _STARPU_MPI_FUT_IRECV_COMPLETE_END		0x5211
															
 
																+#define _STARPU_MPI_FUT_SLEEP_BEGIN			0x5212
															
 
																+#define _STARPU_MPI_FUT_SLEEP_END			0x5213
															
 
																+#define _STARPU_MPI_FUT_DTESTING_BEGIN			0x5214
															
 
																+#define _STARPU_MPI_FUT_DTESTING_END			0x5215
															
 
																+#define _STARPU_MPI_FUT_UTESTING_BEGIN			0x5216
															
 
																+#define _STARPU_MPI_FUT_UTESTING_END			0x5217
															
 
																+#define _STARPU_MPI_FUT_UWAIT_BEGIN			0x5218
															
 
																+#define _STARPU_MPI_FUT_UWAIT_END			0x5219
															
 
																 #ifdef STARPU_USE_FXT
															
 
																-#define TRACE_MPI_START(rank, worldsize)	\
															
 
																-	FUT_DO_PROBE3(FUT_MPI_START, (rank), (worldsize), _starpu_gettid());
															
 
																-#define TRACE_MPI_STOP(rank, worldsize)	\
															
 
																-	FUT_DO_PROBE3(FUT_MPI_STOP, (rank), (worldsize), _starpu_gettid());
															
 
																-#define TRACE_MPI_BARRIER(rank, worldsize, key)	\
															
 
																-	FUT_DO_PROBE4(FUT_MPI_BARRIER, (rank), (worldsize), (key), _starpu_gettid());
															
 
																-#define TRACE_MPI_ISEND_SUBMIT_BEGIN(dest, mpi_tag, size)	\
															
 
																-	FUT_DO_PROBE4(FUT_MPI_ISEND_SUBMIT_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
															
 
																-#define TRACE_MPI_ISEND_SUBMIT_END(dest, mpi_tag, size)	\
															
 
																-	FUT_DO_PROBE4(FUT_MPI_ISEND_SUBMIT_END, (dest), (mpi_tag), (size), _starpu_gettid());
															
 
																-#define TRACE_MPI_IRECV_SUBMIT_BEGIN(src, mpi_tag)	\
															
 
																-	FUT_DO_PROBE3(FUT_MPI_IRECV_SUBMIT_BEGIN, (src), (mpi_tag), _starpu_gettid());
															
 
																-#define TRACE_MPI_IRECV_SUBMIT_END(src, mpi_tag)	\
															
 
																-	FUT_DO_PROBE3(FUT_MPI_IRECV_SUBMIT_END, (src), (mpi_tag), _starpu_gettid());
															
 
																-#define TRACE_MPI_ISEND_COMPLETE_BEGIN(dest, mpi_tag, size)	\
															
 
																-	FUT_DO_PROBE4(FUT_MPI_ISEND_COMPLETE_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
															
 
																-#define TRACE_MPI_ISEND_COMPLETE_END(dest, mpi_tag, size)	\
															
 
																-	FUT_DO_PROBE4(FUT_MPI_ISEND_COMPLETE_END, (dest), (mpi_tag), (size), _starpu_gettid());
															
 
																-#define TRACE_MPI_IRECV_COMPLETE_BEGIN(src, mpi_tag)	\
															
 
																-	FUT_DO_PROBE3(FUT_MPI_IRECV_COMPLETE_BEGIN, (src), (mpi_tag), _starpu_gettid());
															
 
																-#define TRACE_MPI_IRECV_COMPLETE_END(src, mpi_tag)	\
															
 
																-	FUT_DO_PROBE3(FUT_MPI_IRECV_COMPLETE_END, (src), (mpi_tag), _starpu_gettid());
															
 
																-#define TRACE_MPI_SLEEP_BEGIN()	\
															
 
																-	FUT_DO_PROBE1(FUT_MPI_SLEEP_BEGIN, _starpu_gettid());
															
 
																-#define TRACE_MPI_SLEEP_END()	\
															
 
																-	FUT_DO_PROBE1(FUT_MPI_SLEEP_END, _starpu_gettid());
															
 
																-#define TRACE_MPI_DTESTING_BEGIN()	\
															
 
																-	FUT_DO_PROBE1(FUT_MPI_DTESTING_BEGIN,  _starpu_gettid());
															
 
																-#define TRACE_MPI_DTESTING_END()	\
															
 
																-	FUT_DO_PROBE1(FUT_MPI_DTESTING_END, _starpu_gettid());
															
 
																-#define TRACE_MPI_UTESTING_BEGIN(src, mpi_tag)	\
															
 
																-	FUT_DO_PROBE3(FUT_MPI_UTESTING_BEGIN, (src), (mpi_tag),  _starpu_gettid());
															
 
																-#define TRACE_MPI_UTESTING_END(src, mpi_tag)	\
															
 
																-	FUT_DO_PROBE3(FUT_MPI_UTESTING_END, (src), (mpi_tag), _starpu_gettid());
															
 
																-#define TRACE_MPI_UWAIT_BEGIN(src, mpi_tag)	\
															
 
																-	FUT_DO_PROBE3(FUT_MPI_UWAIT_BEGIN, (src), (mpi_tag),  _starpu_gettid());
															
 
																-#define TRACE_MPI_UWAIT_END(src, mpi_tag)	\
															
 
																-	FUT_DO_PROBE3(FUT_MPI_UWAIT_END, (src), (mpi_tag), _starpu_gettid());
															
 
																+#define _STARPU_MPI_TRACE_START(rank, worldsize)	\
															
 
																+	FUT_DO_PROBE3(_STARPU_MPI_FUT_START, (rank), (worldsize), _starpu_gettid());
															
 
																+#define _STARPU_MPI_TRACE_STOP(rank, worldsize)	\
															
 
																+	FUT_DO_PROBE3(_STARPU_MPI_FUT_STOP, (rank), (worldsize), _starpu_gettid());
															
 
																+#define _STARPU_MPI_TRACE_BARRIER(rank, worldsize, key)	\
															
 
																+	FUT_DO_PROBE4(_STARPU_MPI_FUT_BARRIER, (rank), (worldsize), (key), _starpu_gettid());
															
 
																+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(dest, mpi_tag, size)	\
															
 
																+	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
															
 
																+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(dest, mpi_tag, size)	\
															
 
																+	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_SUBMIT_END, (dest), (mpi_tag), (size), _starpu_gettid());
															
 
																+#define _STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(src, mpi_tag)	\
															
 
																+	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN, (src), (mpi_tag), _starpu_gettid());
															
 
																+#define _STARPU_MPI_TRACE_IRECV_SUBMIT_END(src, mpi_tag)	\
															
 
																+	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_SUBMIT_END, (src), (mpi_tag), _starpu_gettid());
															
 
																+#define _STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(dest, mpi_tag, size)	\
															
 
																+	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_COMPLETE_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
															
 
																+#define _STARPU_MPI_TRACE_ISEND_COMPLETE_END(dest, mpi_tag, size)	\
															
 
																+	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_COMPLETE_END, (dest), (mpi_tag), (size), _starpu_gettid());
															
 
																+#define _STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(src, mpi_tag)	\
															
 
																+	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_COMPLETE_BEGIN, (src), (mpi_tag), _starpu_gettid());
															
 
																+#define _STARPU_MPI_TRACE_IRECV_COMPLETE_END(src, mpi_tag)	\
															
 
																+	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_COMPLETE_END, (src), (mpi_tag), _starpu_gettid());
															
 
																+#define _STARPU_MPI_TRACE_SLEEP_BEGIN()	\
															
 
																+	FUT_DO_PROBE1(_STARPU_MPI_FUT_SLEEP_BEGIN, _starpu_gettid());
															
 
																+#define _STARPU_MPI_TRACE_SLEEP_END()	\
															
 
																+	FUT_DO_PROBE1(_STARPU_MPI_FUT_SLEEP_END, _starpu_gettid());
															
 
																+#define _STARPU_MPI_TRACE_DTESTING_BEGIN()	\
															
 
																+	FUT_DO_PROBE1(_STARPU_MPI_FUT_DTESTING_BEGIN,  _starpu_gettid());
															
 
																+#define _STARPU_MPI_TRACE_DTESTING_END()	\
															
 
																+	FUT_DO_PROBE1(_STARPU_MPI_FUT_DTESTING_END, _starpu_gettid());
															
 
																+#define _STARPU_MPI_TRACE_UTESTING_BEGIN(src, mpi_tag)	\
															
 
																+	FUT_DO_PROBE3(_STARPU_MPI_FUT_UTESTING_BEGIN, (src), (mpi_tag),  _starpu_gettid());
															
 
																+#define _STARPU_MPI_TRACE_UTESTING_END(src, mpi_tag)	\
															
 
																+	FUT_DO_PROBE3(_STARPU_MPI_FUT_UTESTING_END, (src), (mpi_tag), _starpu_gettid());
															
 
																+#define _STARPU_MPI_TRACE_UWAIT_BEGIN(src, mpi_tag)	\
															
 
																+	FUT_DO_PROBE3(_STARPU_MPI_FUT_UWAIT_BEGIN, (src), (mpi_tag),  _starpu_gettid());
															
 
																+#define _STARPU_MPI_TRACE_UWAIT_END(src, mpi_tag)	\
															
 
																+	FUT_DO_PROBE3(_STARPU_MPI_FUT_UWAIT_END, (src), (mpi_tag), _starpu_gettid());
															
 
																 #define TRACE
															
 
																 #else
															
 
																-#define TRACE_MPI_START(a, b)				do {} while(0);
															
 
																-#define TRACE_MPI_STOP(a, b)				do {} while(0);
															
 
																-#define TRACE_MPI_BARRIER(a, b, c)			do {} while(0);
															
 
																-#define TRACE_MPI_ISEND_SUBMIT_BEGIN(a, b, c)		do {} while(0);
															
 
																-#define TRACE_MPI_ISEND_SUBMIT_END(a, b, c)		do {} while(0);
															
 
																-#define TRACE_MPI_IRECV_SUBMIT_BEGIN(a, b)		do {} while(0);
															
 
																-#define TRACE_MPI_IRECV_SUBMIT_END(a, b)		do {} while(0);
															
 
																-#define TRACE_MPI_ISEND_COMPLETE_BEGIN(a, b, c)		do {} while(0);
															
 
																-#define TRACE_MPI_ISEND_COMPLETE_END(a, b, c)		do {} while(0);
															
 
																-#define TRACE_MPI_IRECV_COMPLETE_BEGIN(a, b)		do {} while(0);
															
 
																-#define TRACE_MPI_IRECV_COMPLETE_END(a, b)		do {} while(0);
															
 
																-#define TRACE_MPI_SLEEP_BEGIN()				do {} while(0);
															
 
																-#define TRACE_MPI_SLEEP_END()				do {} while(0);
															
 
																-#define TRACE_MPI_DTESTING_BEGIN()			do {} while(0);
															
 
																-#define TRACE_MPI_DTESTING_END()			do {} while(0);
															
 
																-#define TRACE_MPI_UTESTING_BEGIN(a, b)			do {} while(0);
															
 
																-#define TRACE_MPI_UTESTING_END(a, b)			do {} while(0);
															
 
																-#define TRACE_MPI_UWAIT_BEGIN(a, b)			do {} while(0);
															
 
																-#define TRACE_MPI_UWAIT_END(a, b)			do {} while(0);
															
 
																+#define _STARPU_MPI_TRACE_START(a, b)				do {} while(0);
															
 
																+#define _STARPU_MPI_TRACE_STOP(a, b)				do {} while(0);
															
 
																+#define _STARPU_MPI_TRACE_BARRIER(a, b, c)			do {} while(0);
															
 
																+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(a, b, c)		do {} while(0);
															
 
																+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(a, b, c)		do {} while(0);
															
 
																+#define _STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(a, b)		do {} while(0);
															
 
																+#define _STARPU_MPI_TRACE_IRECV_SUBMIT_END(a, b)		do {} while(0);
															
 
																+#define _STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(a, b, c)		do {} while(0);
															
 
																+#define _STARPU_MPI_TRACE_ISEND_COMPLETE_END(a, b, c)		do {} while(0);
															
 
																+#define _STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(a, b)		do {} while(0);
															
 
																+#define _STARPU_MPI_TRACE_IRECV_COMPLETE_END(a, b)		do {} while(0);
															
 
																+#define _STARPU_MPI_TRACE_SLEEP_BEGIN()				do {} while(0);
															
 
																+#define _STARPU_MPI_TRACE_SLEEP_END()				do {} while(0);
															
 
																+#define _STARPU_MPI_TRACE_DTESTING_BEGIN()			do {} while(0);
															
 
																+#define _STARPU_MPI_TRACE_DTESTING_END()			do {} while(0);
															
 
																+#define _STARPU_MPI_TRACE_UTESTING_BEGIN(a, b)			do {} while(0);
															
 
																+#define _STARPU_MPI_TRACE_UTESTING_END(a, b)			do {} while(0);
															
 
																+#define _STARPU_MPI_TRACE_UWAIT_BEGIN(a, b)			do {} while(0);
															
 
																+#define _STARPU_MPI_TRACE_UWAIT_END(a, b)			do {} while(0);
															
 
																 #endif
															
 
																 #ifdef __cplusplus
															
--- a/src/common/starpu_spinlock.c
+++ b/src/common/starpu_spinlock.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010, 2011, 2013, 2014  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -24,15 +24,17 @@
 
																 int _starpu_spin_init(struct _starpu_spinlock *lock)
															
 
																 {
															
 
																 #if defined(STARPU_SPINLOCK_CHECK)
															
 
																+	starpu_pthread_mutexattr_t errcheck_attr;
															
 
																 //	memcpy(&lock->errcheck_lock, PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP, sizeof(PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP));
															
 
																 	int ret;
															
 
																-	ret = starpu_pthread_mutexattr_init(&lock->errcheck_attr);
															
 
																+	ret = starpu_pthread_mutexattr_init(&errcheck_attr);
															
 
																 	STARPU_CHECK_RETURN_VALUE(ret, "pthread_mutexattr_init");
															
 
																-	ret = starpu_pthread_mutexattr_settype(&lock->errcheck_attr, PTHREAD_MUTEX_ERRORCHECK);
															
 
																+	ret = starpu_pthread_mutexattr_settype(&errcheck_attr, PTHREAD_MUTEX_ERRORCHECK);
															
 
																 	STARPU_ASSERT(!ret);
															
 
																-	ret = starpu_pthread_mutex_init(&lock->errcheck_lock, &lock->errcheck_attr);
															
 
																+	ret = starpu_pthread_mutex_init(&lock->errcheck_lock, &errcheck_attr);
															
 
																+	starpu_pthread_mutexattr_destroy(&errcheck_attr);
															
 
																 	return ret;
															
 
																 #else
															
 
																 	int ret = starpu_pthread_spin_init(&lock->lock, 0);
															
@@ -44,7 +46,6 @@ int _starpu_spin_init(struct _starpu_spinlock *lock)
 
																 int _starpu_spin_destroy(struct _starpu_spinlock *lock STARPU_ATTRIBUTE_UNUSED)
															
 
																 {
															
 
																 #if defined(STARPU_SPINLOCK_CHECK)
															
 
																-	starpu_pthread_mutexattr_destroy(&lock->errcheck_attr);
															
 
																 	return starpu_pthread_mutex_destroy(&lock->errcheck_lock);
															
 
																 #else
															
 
																 	return starpu_pthread_spin_destroy(&lock->lock);
															
--- a/src/common/starpu_spinlock.h
+++ b/src/common/starpu_spinlock.h
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010-2013  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010-2014  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010, 2011, 2013, 2014  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -25,7 +25,6 @@
 
																 struct _starpu_spinlock
															
 
																 {
															
 
																 #if defined(STARPU_SPINLOCK_CHECK)
															
 
																-	starpu_pthread_mutexattr_t errcheck_attr;
															
 
																 	starpu_pthread_mutex_t errcheck_lock;
															
 
																 	const char *last_taker;
															
 
																 #else
															
--- a/src/common/utils.c
+++ b/src/common/utils.c
@@ -83,10 +83,24 @@ int _starpu_mkpath(const char *s, mode_t mode)
 
																 	if ((_starpu_mkpath(up, mode) == -1) && (errno != EEXIST))
															
 
																 		goto out;
															
 
																-	if ((mkdir(path, mode) == -1) && (errno != EEXIST))
															
 
																-		rv = -1;
															
 
																-	else
															
 
																+	struct stat sb;
															
 
																+	if (stat(path, &sb) == 0)
															
 
																+	{
															
 
																+		if (!S_ISDIR(sb.st_mode))
															
 
																+		{
															
 
																+			fprintf(stderr,"Error: %s is not a directory:\n", path);
															
 
																+			STARPU_ABORT();
															
 
																+		}
															
 
																+		/* It already exists and is a directory.  */
															
 
																 		rv = 0;
															
 
																+	}
															
 
																+	else
															
 
																+	{
															
 
																+		if ((mkdir(path, mode) == -1) && (errno != EEXIST))
															
 
																+			rv = -1;
															
 
																+		else
															
 
																+			rv = 0;
															
 
																+	}
															
 
																 out:
															
 
																 	olderrno = errno;
															
@@ -105,23 +119,11 @@ void _starpu_mkpath_and_check(const char *path, mode_t mode)
 
																 	ret = _starpu_mkpath(path, mode);
															
 
																-	if (ret == -1)
															
 
																+	if (ret == -1 && errno != EEXIST)
															
 
																 	{
															
 
																-		if (errno != EEXIST)
															
 
																-		{
															
 
																-			fprintf(stderr,"Error making StarPU directory %s:\n", path);
															
 
																-			perror("mkdir");
															
 
																-			STARPU_ABORT();
															
 
																-		}
															
 
																-
															
 
																-		/* make sure that it is actually a directory */
															
 
																-		struct stat sb;
															
 
																-		stat(path, &sb);
															
 
																-		if (!S_ISDIR(sb.st_mode))
															
 
																-		{
															
 
																-			fprintf(stderr,"Error: %s is not a directory:\n", path);
															
 
																-			STARPU_ABORT();
															
 
																-		}
															
 
																+		fprintf(stderr,"Error making StarPU directory %s:\n", path);
															
 
																+		perror("mkdir");
															
 
																+		STARPU_ABORT();
															
 
																 	}
															
 
																 }
															
--- a/src/core/perfmodel/perfmodel_history.c
+++ b/src/core/perfmodel/perfmodel_history.c
@@ -1230,7 +1230,7 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 
																 			char archname[32];
															
 
																 			starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), nimpl);
															
 
																-			_STARPU_DISP("Warning: model %s is not calibrated enough for %s, forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname);
															
 
																+			_STARPU_DISP("Warning: model %s is not calibrated enough for %s (only %u measurements), forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname, entry && entry->history_entry ? entry->history_entry->nsample : 0);
															
 
																 			_starpu_set_calibrate_flag(1);
															
 
																 			model->benchmarking = 1;
															
 
																 		}
															
@@ -1272,7 +1272,7 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, s
 
																 		char archname[32];
															
 
																 		starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), nimpl);
															
 
																-		_STARPU_DISP("Warning: model %s is not calibrated enough for %s, forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname);
															
 
																+		_STARPU_DISP("Warning: model %s is not calibrated enough for %s (only %u measurements), forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname, entry ? entry->nsample : 0);
															
 
																 		_starpu_set_calibrate_flag(1);
															
 
																 		model->benchmarking = 1;
															
 
																 	}
															
--- a/src/core/sched_policy.c
+++ b/src/core/sched_policy.c
@@ -841,6 +841,7 @@ pick:
 
																 	 * We do have a task that uses multiformat handles. Let's create the
															
 
																 	 * required conversion tasks.
															
 
																 	 */
															
 
																+	STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
															
 
																 	unsigned i;
															
 
																 	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
															
 
																 	for (i = 0; i < nbuffers; i++)
															
@@ -864,6 +865,7 @@ pick:
 
																 	task->mf_skip = 1;
															
 
																 	starpu_task_list_push_back(&worker->local_tasks, task);
															
 
																+	STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
															
 
																 	goto pick;
															
 
																 profiling:
															
--- a/src/core/simgrid.c
+++ b/src/core/simgrid.c
@@ -27,6 +27,7 @@
 
																 #ifdef STARPU_SIMGRID
															
 
																 #include <msg/msg.h>
															
 
																 #include <smpi/smpif.h>
															
 
																+#include <sys/resource.h>
															
 
																 #define STARPU_MPI_AS_PREFIX "StarPU-MPI"
															
@@ -160,7 +161,7 @@ int main(int argc, char **argv)
 
																 	if (!starpu_main && !(smpi_main && smpi_simulated_main_))
															
 
																 	{
															
 
																-		_STARPU_ERROR("The main file of this application needs to be compiled with starpu.h included, to properly define starpu_main\n");
															
 
																+		_STARPU_ERROR("In simgrid mode, the file containing the main() function of this application needs to be compiled with starpu.h included, to properly rename it into starpu_main\n");
															
 
																 		exit(EXIT_FAILURE);
															
 
																 	}
															
@@ -178,7 +179,12 @@ int main(int argc, char **argv)
 
																 #endif
															
 
																 	/* Simgrid uses tiny stacks by default.  This comes unexpected to our users.  */
															
 
																 	extern xbt_cfg_t _sg_cfg_set;
															
 
																-	xbt_cfg_set_int(_sg_cfg_set, "contexts/stack_size", 8192);
															
 
																+	unsigned stack_size = 8192;
															
 
																+	struct rlimit rlim;
															
 
																+	if (getrlimit(RLIMIT_STACK, &rlim) == 0 && rlim.rlim_cur != 0 && rlim.rlim_cur != RLIM_INFINITY)
															
 
																+		stack_size = rlim.rlim_cur / 1024;
															
 
																+
															
 
																+	xbt_cfg_set_int(_sg_cfg_set, "contexts/stack_size", stack_size);
															
 
																 	/* Load XML platform */
															
 
																 	_starpu_simgrid_get_platform_path(path, sizeof(path));
															
@@ -196,6 +202,12 @@ void _starpu_simgrid_init()
 
																 	xbt_dynar_t hosts;
															
 
																 	int i;
															
 
																+	if (!starpu_main && !(smpi_main && smpi_simulated_main_))
															
 
																+	{
															
 
																+		_STARPU_ERROR("In simgrid mode, the file containing the main() function of this application needs to be compiled with starpu.h included, to properly rename it into starpu_main\n");
															
 
																+		exit(EXIT_FAILURE);
															
 
																+	}
															
 
																+
															
 
																 #ifdef HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT
															
 
																 	if (_starpu_simgrid_running_smpi())
															
 
																 	{
															
@@ -253,6 +265,10 @@ void _starpu_simgrid_init()
 
																 	xbt_dynar_free(&hosts);
															
 
																 }
															
 
																+/*
															
 
																+ * Tasks
															
 
																+ */
															
 
																+
															
 
																 /* Task execution submitted by StarPU */
															
 
																 void _starpu_simgrid_execute_job(struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length)
															
 
																 {
															
@@ -276,8 +292,13 @@ void _starpu_simgrid_execute_job(struct _starpu_job *j, struct starpu_perfmodel_
 
																 			length/1000000.0*MSG_get_host_speed(MSG_host_self()),
															
 
																 			0, NULL);
															
 
																 	MSG_task_execute(simgrid_task);
															
 
																+	MSG_task_destroy(simgrid_task);
															
 
																 }
															
 
																+/*
															
 
																+ * Transfers
															
 
																+ */
															
 
																+
															
 
																 /* Note: simgrid is not parallel, so there is no need to hold locks for management of transfers.  */
															
 
																 LIST_TYPE(transfer,
															
 
																 	msg_task_t task;
															
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -1218,16 +1218,17 @@ out:
 
																 			STARPU_ASSERT(worker->local_ordered_tasks[n] == NULL);
															
 
																 		_starpu_sched_ctx_list_delete(&worker->sched_ctx_list);
															
 
																 		_starpu_job_list_delete(worker->terminated_jobs);
															
 
																+		free(worker->local_ordered_tasks);
															
 
																 	}
															
 
																 }
															
 
																 /* Condition variable and mutex used to pause/resume. */
															
 
																 static starpu_pthread_cond_t pause_cond = STARPU_PTHREAD_COND_INITIALIZER;
															
 
																 static starpu_pthread_mutex_t pause_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
															
 
																-unsigned _starpu_machine_is_running(void)
															
 
																+
															
 
																+void _starpu_may_pause(void)
															
 
																 {
															
 
																-	unsigned ret;
															
 
																-	/* running and pause_depth are just protected by a memory barrier */
															
 
																+	/* pause_depth is just protected by a memory barrier */
															
 
																 	STARPU_RMB();
															
 
																 	if (STARPU_UNLIKELY(config.pause_depth > 0)) {
															
@@ -1237,6 +1238,13 @@ unsigned _starpu_machine_is_running(void)
 
																 		}
															
 
																 		STARPU_PTHREAD_MUTEX_UNLOCK(&pause_mutex);
															
 
																 	}
															
 
																+}
															
 
																+
															
 
																+unsigned _starpu_machine_is_running(void)
															
 
																+{
															
 
																+	unsigned ret;
															
 
																+	/* running is just protected by a memory barrier */
															
 
																+	STARPU_RMB();
															
 
																 	ANNOTATE_HAPPENS_AFTER(&config.running);
															
 
																 	ret = config.running;
															
--- a/src/core/workers.h
+++ b/src/core/workers.h
@@ -354,6 +354,9 @@ char ***_starpu_get_argv();
 
																 /* Fill conf with environment variables */
															
 
																 void _starpu_conf_check_environment(struct starpu_conf *conf);
															
 
																+/* Called by the driver when it is ready to pause  */
															
 
																+void _starpu_may_pause(void);
															
 
																+
															
 
																 /* Has starpu_shutdown already been called ? */
															
 
																 unsigned _starpu_machine_is_running(void);
															
--- a/src/datawizard/coherency.c
+++ b/src/datawizard/coherency.c
@@ -41,8 +41,6 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 
																 	double cost = INFINITY;
															
 
																 	unsigned src_node_mask = 0;
															
 
																-	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
															
 
																-
															
 
																 	for (node = 0; node < nnodes; node++)
															
 
																 	{
															
 
																 		if (handle->per_node[node].state != STARPU_INVALID)
															
@@ -74,15 +72,6 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 
																 				double time = starpu_transfer_predict(i, destination, size);
															
 
																 				unsigned handling_node;
															
 
																-				/* Avoid transfers which the interface does not want */
															
 
																-				if (copy_methods->can_copy)
															
 
																-				{
															
 
																-					void *src_interface = handle->per_node[i].data_interface;
															
 
																-					void *dst_interface = handle->per_node[destination].data_interface;
															
 
																-					if (!copy_methods->can_copy(src_interface, i, dst_interface, destination))
															
 
																-						continue;
															
 
																-				}
															
 
																-
															
 
																 				/* Avoid indirect transfers */
															
 
																 				if (!link_supports_direct_transfers(handle, i, destination, &handling_node))
															
 
																 					continue;
															
@@ -115,22 +104,22 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 
																 		if (src_node_mask & (1<<i))
															
 
																 		{
															
 
																+			int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node) = handle->ops->copy_methods->can_copy;
															
 
																 			/* Avoid transfers which the interface does not want */
															
 
																-			if (copy_methods->can_copy)
															
 
																+			if (can_copy)
															
 
																 			{
															
 
																 				void *src_interface = handle->per_node[i].data_interface;
															
 
																 				void *dst_interface = handle->per_node[destination].data_interface;
															
 
																 				unsigned handling_node;
															
 
																-				if (!copy_methods->can_copy(src_interface, i, dst_interface, destination))
															
 
																-					continue;
															
 
																-
															
 
																 				if (!link_supports_direct_transfers(handle, i, destination, &handling_node))
															
 
																 				{
															
 
																 					/* Avoid through RAM if the interface does not want it */
															
 
																 					void *ram_interface = handle->per_node[STARPU_MAIN_RAM].data_interface;
															
 
																-					if (!copy_methods->can_copy(src_interface, i, ram_interface, STARPU_MAIN_RAM)
															
 
																-					 || !copy_methods->can_copy(ram_interface, STARPU_MAIN_RAM, dst_interface, destination))
															
 
																+					if ((!can_copy(src_interface, i, ram_interface, STARPU_MAIN_RAM, i)
															
 
																+					  && !can_copy(src_interface, i, ram_interface, STARPU_MAIN_RAM, STARPU_MAIN_RAM))
															
 
																+					 || (!can_copy(ram_interface, STARPU_MAIN_RAM, dst_interface, destination, STARPU_MAIN_RAM)
															
 
																+					  && !can_copy(ram_interface, STARPU_MAIN_RAM, dst_interface, destination, destination)))
															
 
																 						continue;
															
 
																 				}
															
 
																 			}
															
@@ -251,7 +240,9 @@ static int worker_supports_direct_access(unsigned node, unsigned handling_node)
 
																 static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned src_node, unsigned dst_node, unsigned *handling_node)
															
 
																 {
															
 
																-	(void) handle; // unused
															
 
																+	int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node) = handle->ops->copy_methods->can_copy;
															
 
																+	void *src_interface = handle->per_node[src_node].data_interface;
															
 
																+	void *dst_interface = handle->per_node[dst_node].data_interface;
															
 
																 	/* XXX That's a hack until we fix cudaMemcpy3DPeerAsync in the block interface
															
 
																 	 * Perhaps not all data interface provide a direct GPU-GPU transfer
															
@@ -260,19 +251,19 @@ static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned
 
																 	if (src_node != dst_node && starpu_node_get_kind(src_node) == STARPU_CUDA_RAM && starpu_node_get_kind(dst_node) == STARPU_CUDA_RAM)
															
 
																 	{
															
 
																 		const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
															
 
																-		if (!copy_methods->cuda_to_cuda_async)
															
 
																+		if (!copy_methods->cuda_to_cuda_async && !copy_methods->any_to_any)
															
 
																 			return 0;
															
 
																 	}
															
 
																 #endif
															
 
																 	/* Note: with CUDA, performance seems a bit better when issuing the transfer from the destination (tested without GPUDirect, but GPUDirect probably behave the same) */
															
 
																-	if (worker_supports_direct_access(src_node, dst_node))
															
 
																+	if (worker_supports_direct_access(src_node, dst_node) && (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, dst_node)))
															
 
																 	{
															
 
																 		*handling_node = dst_node;
															
 
																 		return 1;
															
 
																 	}
															
 
																-	if (worker_supports_direct_access(dst_node, src_node))
															
 
																+	if (worker_supports_direct_access(dst_node, src_node) && (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, src_node)))
															
 
																 	{
															
 
																 		*handling_node = src_node;
															
 
																 		return 1;
															
@@ -319,6 +310,10 @@ static int determine_request_path(starpu_data_handle_t handle,
 
																 	if (!link_is_valid)
															
 
																 	{
															
 
																+		int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node) = handle->ops->copy_methods->can_copy;
															
 
																+		void *src_interface = handle->per_node[src_node].data_interface;
															
 
																+		void *dst_interface = handle->per_node[dst_node].data_interface;
															
 
																+
															
 
																 		/* We need an intermediate hop to implement data staging
															
 
																 		 * through main memory. */
															
 
																 		STARPU_ASSERT(max_len >= 2);
															
@@ -326,12 +321,36 @@ static int determine_request_path(starpu_data_handle_t handle,
 
																 		/* GPU -> RAM */
															
 
																 		src_nodes[0] = src_node;
															
 
																 		dst_nodes[0] = STARPU_MAIN_RAM;
															
 
																-		handling_nodes[0] = starpu_node_get_kind(src_node) == STARPU_DISK_RAM ? dst_node : src_node;
															
 
																+
															
 
																+		if (starpu_node_get_kind(src_node) == STARPU_DISK_RAM)
															
 
																+			/* Disks don't have their own driver thread */
															
 
																+			handling_nodes[0] = dst_node;
															
 
																+		else if (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, src_node))
															
 
																+		{
															
 
																+			handling_nodes[0] = src_node;
															
 
																+		}
															
 
																+		else
															
 
																+		{
															
 
																+			STARPU_ASSERT_MSG(can_copy(src_interface, src_node, dst_interface, dst_node, dst_node), "interface %d refuses all kinds of transfers from node %u to node %u\n", handle->ops->interfaceid, src_node, dst_node);
															
 
																+			handling_nodes[0] = dst_node;
															
 
																+		}
															
 
																 		/* RAM -> GPU */
															
 
																 		src_nodes[1] = STARPU_MAIN_RAM;
															
 
																 		dst_nodes[1] = dst_node;
															
 
																-		handling_nodes[1] = starpu_node_get_kind(dst_node) == STARPU_DISK_RAM ? src_node : dst_node;
															
 
																+
															
 
																+		if (starpu_node_get_kind(dst_node) == STARPU_DISK_RAM)
															
 
																+			/* Disks don't have their own driver thread */
															
 
																+			handling_nodes[1] = src_node;
															
 
																+		else if (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, dst_node))
															
 
																+		{
															
 
																+			handling_nodes[1] = dst_node;
															
 
																+		}
															
 
																+		else
															
 
																+		{
															
 
																+			STARPU_ASSERT_MSG(can_copy(src_interface, src_node, dst_interface, dst_node, src_node), "interface %d refuses all kinds of transfers from node %u to node %u\n", handle->ops->interfaceid, src_node, dst_node);
															
 
																+			handling_nodes[1] = src_node;
															
 
																+		}
															
 
																 		return 2;
															
 
																 	}
															
--- a/src/datawizard/data_request.c
+++ b/src/datawizard/data_request.c
@@ -155,17 +155,22 @@ int _starpu_wait_data_request_completion(struct _starpu_data_request *r, unsigne
 
																 {
															
 
																 	int retval;
															
 
																 	int do_delete = 0;
															
 
																+	int completed;
															
 
																 	unsigned local_node = _starpu_memory_node_get_local_key();
															
 
																 	do
															
 
																 	{
															
 
																-		_starpu_spin_lock(&r->lock);
															
 
																-
															
 
																-		if (r->completed)
															
 
																-			break;
															
 
																-
															
 
																-		_starpu_spin_unlock(&r->lock);
															
 
																+		STARPU_HG_DISABLE_CHECKING(r->completed);
															
 
																+		completed = r->completed;
															
 
																+		STARPU_HG_ENABLE_CHECKING(r->completed);
															
 
																+		if (completed)
															
 
																+		{
															
 
																+			_starpu_spin_lock(&r->lock);
															
 
																+			if (r->completed)
															
 
																+				break;
															
 
																+			_starpu_spin_unlock(&r->lock);
															
 
																+		}
															
 
																 #ifndef STARPU_NON_BLOCKING_DRIVERS
															
 
																 		_starpu_wake_all_blocked_workers_on_node(r->handling_node);
															
--- a/src/datawizard/malloc.c
+++ b/src/datawizard/malloc.c
@@ -50,7 +50,7 @@ struct malloc_pinned_codelet_struct
 
																 //{
															
 
																 //	struct malloc_pinned_codelet_struct *s = arg;
															
 
																 //        //        *(s->ptr) = malloc(s->dim);
															
 
																-//        starpu_opencl_allocate_memory((void **)(s->ptr), s->dim, CL_MEM_READ_WRITE|CL_MEM_ALLOC_HOST_PTR);
															
 
																+//        starpu_opencl_allocate_memory(devid, (void **)(s->ptr), s->dim, CL_MEM_READ_WRITE|CL_MEM_ALLOC_HOST_PTR);
															
 
																 //}
															
 
																 //#endif
															
@@ -404,6 +404,14 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size)
 
																 			STARPU_ASSERT(last[dst_node] >= addr);
															
 
																 			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
															
 
																 #else
															
 
																+			struct _starpu_worker *worker = _starpu_get_local_worker_key();
															
 
																+			unsigned devid = _starpu_memory_node_get_devid(dst_node);
															
 
																+			if (!worker || worker->arch != STARPU_CUDA_WORKER || worker->devid != devid)
															
 
																+#if defined(HAVE_CUDA_MEMCPY_PEER)
															
 
																+				starpu_cuda_set_device(devid);
															
 
																+#else
															
 
																+				STARPU_ASSERT_MSG(0, "CUDA peer access is not available with this version of CUDA");
															
 
																+#endif
															
 
																 			status = cudaMalloc((void **)&addr, size);
															
 
																 			if (!addr || (status != cudaSuccess))
															
 
																 			{
															
@@ -433,7 +441,7 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size)
 
																                                 int ret;
															
 
																 				cl_mem ptr;
															
 
																-				ret = starpu_opencl_allocate_memory(&ptr, size, CL_MEM_READ_WRITE);
															
 
																+				ret = starpu_opencl_allocate_memory(_starpu_memory_node_get_devid(dst_node), &ptr, size, CL_MEM_READ_WRITE);
															
 
																 				if (ret)
															
 
																 				{
															
 
																 					addr = 0;
															
@@ -505,6 +513,14 @@ _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 
																 			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
															
 
																 #else
															
 
																 			cudaError_t err;
															
 
																+			struct _starpu_worker *worker = _starpu_get_local_worker_key();
															
 
																+			unsigned devid = _starpu_memory_node_get_devid(dst_node);
															
 
																+			if (!worker || worker->arch != STARPU_CUDA_WORKER || worker->devid != devid)
															
 
																+#if defined(HAVE_CUDA_MEMCPY_PEER)
															
 
																+				starpu_cuda_set_device(devid);
															
 
																+#else
															
 
																+				STARPU_ASSERT_MSG(0, "CUDA peer access is not available with this version of CUDA");
															
 
																+#endif
															
 
																 			err = cudaFree((void*)addr);
															
 
																 			if (STARPU_UNLIKELY(err != cudaSuccess))
															
 
																 				STARPU_CUDA_REPORT_ERROR(err);
															
--- a/src/datawizard/memalloc.c
+++ b/src/datawizard/memalloc.c
@@ -440,19 +440,24 @@ static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_re
 
																 	if (!old_replicate)
															
 
																 	{
															
 
																+		/* Free the copy that we made */
															
 
																 		free(mc->chunk_interface);
															
 
																 		mc->chunk_interface = NULL;
															
 
																 	}
															
 
																-	mc->data = new_replicate->handle;
															
 
																-	/* mc->ops, mc->footprint and mc->interface should be
															
 
																+	/* XXX: We do not actually reuse the mc at the moment, only the interface */
															
 
																+
															
 
																+	/* mc->data = new_replicate->handle; */
															
 
																+	/* mc->footprint, mc->ops, mc->size_interface, mc->automatically_allocated should be
															
 
																  	 * unchanged ! */
															
 
																-	/* reinsert the mem chunk in the list of active memory chunks */
															
 
																-	if (!is_already_in_mc_list)
															
 
																+	/* remove the mem chunk from the list of active memory chunks, register_mem_chunk will put it back later */
															
 
																+	if (is_already_in_mc_list)
															
 
																 	{
															
 
																-		_starpu_mem_chunk_list_push_back(mc_list[node], mc);
															
 
																+		_starpu_mem_chunk_list_erase(mc_list[node], mc);
															
 
																 	}
															
 
																+
															
 
																+	free(mc);
															
 
																 }
															
 
																 static unsigned try_to_reuse_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node, struct _starpu_data_replicate *replicate, unsigned is_already_in_mc_list)
															
--- a/src/debug/traces/starpu_fxt.c
+++ b/src/debug/traces/starpu_fxt.c
@@ -1893,79 +1893,79 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 
																 				handle_user_event(&ev, options);
															
 
																 				break;
															
 
																-			case FUT_MPI_START:
															
 
																+			case _STARPU_MPI_FUT_START:
															
 
																 				handle_mpi_start(&ev, options);
															
 
																 				break;
															
 
																-			case FUT_MPI_STOP:
															
 
																+			case _STARPU_MPI_FUT_STOP:
															
 
																 				handle_mpi_stop(&ev, options);
															
 
																 				break;
															
 
																-			case FUT_MPI_BARRIER:
															
 
																+			case _STARPU_MPI_FUT_BARRIER:
															
 
																 				handle_mpi_barrier(&ev, options);
															
 
																 				break;
															
 
																-			case FUT_MPI_ISEND_SUBMIT_BEGIN:
															
 
																+			case _STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN:
															
 
																 				handle_mpi_isend_submit_begin(&ev, options);
															
 
																 				break;
															
 
																-			case FUT_MPI_ISEND_SUBMIT_END:
															
 
																+			case _STARPU_MPI_FUT_ISEND_SUBMIT_END:
															
 
																 				handle_mpi_isend_submit_end(&ev, options);
															
 
																 				break;
															
 
																-			case FUT_MPI_IRECV_SUBMIT_BEGIN:
															
 
																+			case _STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN:
															
 
																 				handle_mpi_irecv_submit_begin(&ev, options);
															
 
																 				break;
															
 
																-			case FUT_MPI_IRECV_SUBMIT_END:
															
 
																+			case _STARPU_MPI_FUT_IRECV_SUBMIT_END:
															
 
																 				handle_mpi_irecv_submit_end(&ev, options);
															
 
																 				break;
															
 
																-			case FUT_MPI_ISEND_COMPLETE_BEGIN:
															
 
																+			case _STARPU_MPI_FUT_ISEND_COMPLETE_BEGIN:
															
 
																 				handle_mpi_isend_complete_begin(&ev, options);
															
 
																 				break;
															
 
																-			case FUT_MPI_ISEND_COMPLETE_END:
															
 
																+			case _STARPU_MPI_FUT_ISEND_COMPLETE_END:
															
 
																 				handle_mpi_isend_complete_end(&ev, options);
															
 
																 				break;
															
 
																-			case FUT_MPI_IRECV_COMPLETE_BEGIN:
															
 
																+			case _STARPU_MPI_FUT_IRECV_COMPLETE_BEGIN:
															
 
																 				handle_mpi_irecv_complete_begin(&ev, options);
															
 
																 				break;
															
 
																-			case FUT_MPI_IRECV_COMPLETE_END:
															
 
																+			case _STARPU_MPI_FUT_IRECV_COMPLETE_END:
															
 
																 				handle_mpi_irecv_complete_end(&ev, options);
															
 
																 				break;
															
 
																-			case FUT_MPI_SLEEP_BEGIN:
															
 
																+			case _STARPU_MPI_FUT_SLEEP_BEGIN:
															
 
																 				handle_mpi_sleep_begin(&ev, options);
															
 
																 				break;
															
 
																-			case FUT_MPI_SLEEP_END:
															
 
																+			case _STARPU_MPI_FUT_SLEEP_END:
															
 
																 				handle_mpi_sleep_end(&ev, options);
															
 
																 				break;
															
 
																-			case FUT_MPI_DTESTING_BEGIN:
															
 
																+			case _STARPU_MPI_FUT_DTESTING_BEGIN:
															
 
																 				handle_mpi_dtesting_begin(&ev, options);
															
 
																 				break;
															
 
																-			case FUT_MPI_DTESTING_END:
															
 
																+			case _STARPU_MPI_FUT_DTESTING_END:
															
 
																 				handle_mpi_dtesting_end(&ev, options);
															
 
																 				break;
															
 
																-			case FUT_MPI_UTESTING_BEGIN:
															
 
																+			case _STARPU_MPI_FUT_UTESTING_BEGIN:
															
 
																 				handle_mpi_utesting_begin(&ev, options);
															
 
																 				break;
															
 
																-			case FUT_MPI_UTESTING_END:
															
 
																+			case _STARPU_MPI_FUT_UTESTING_END:
															
 
																 				handle_mpi_utesting_end(&ev, options);
															
 
																 				break;
															
 
																-			case FUT_MPI_UWAIT_BEGIN:
															
 
																+			case _STARPU_MPI_FUT_UWAIT_BEGIN:
															
 
																 				handle_mpi_uwait_begin(&ev, options);
															
 
																 				break;
															
 
																-			case FUT_MPI_UWAIT_END:
															
 
																+			case _STARPU_MPI_FUT_UWAIT_END:
															
 
																 				handle_mpi_uwait_end(&ev, options);
															
 
																 				break;
															
--- a/src/debug/traces/starpu_fxt_mpi.c
+++ b/src/debug/traces/starpu_fxt_mpi.c
@@ -74,7 +74,7 @@ int _starpu_fxt_mpi_find_sync_point(char *filename_in, uint64_t *offset, int *ke
 
																 			break;
															
 
																 		}
															
 
																-		if (ev.code == FUT_MPI_BARRIER)
															
 
																+		if (ev.code == _STARPU_MPI_FUT_BARRIER)
															
 
																 		{
															
 
																 			/* We found the sync point */
															
 
																 			*offset = ev.time;
															
--- a/src/drivers/cpu/driver_cpu.c
+++ b/src/drivers/cpu/driver_cpu.c
@@ -301,7 +301,10 @@ _starpu_cpu_worker(void *arg)
 
																 	_starpu_cpu_driver_init(args);
															
 
																 	while (_starpu_machine_is_running())
															
 
																+	{
															
 
																+		_starpu_may_pause();
															
 
																 		_starpu_cpu_driver_run_once(args);
															
 
																+	}
															
 
																 	_starpu_cpu_driver_deinit(args);
															
 
																 	return NULL;
															
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -567,6 +567,11 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 
																 		_STARPU_DEBUG("cuda (%s) dev id %u worker %u thread is ready to run on CPU %d !\n", devname, devid, i, worker->bindid);
															
 
																 		worker->pipeline_length = starpu_get_env_number_default("STARPU_CUDA_PIPELINE", 2);
															
 
																+		if (worker->pipeline_length > STARPU_MAX_PIPELINE)
															
 
																+		{
															
 
																+			_STARPU_DISP("Warning: STARPU_CUDA_PIPELINE is %u, but STARPU_MAX_PIPELINE is only %u", worker->pipeline_length, STARPU_MAX_PIPELINE);
															
 
																+			worker->pipeline_length = STARPU_MAX_PIPELINE;
															
 
																+		}
															
 
																 		_STARPU_TRACE_WORKER_INIT_END(worker_set->workers[i].workerid);
															
 
																 	}
															
@@ -752,7 +757,10 @@ void *_starpu_cuda_worker(void *_arg)
 
																 	_starpu_cuda_driver_init(worker);
															
 
																 	_STARPU_TRACE_START_PROGRESS(memnode);
															
 
																 	while (_starpu_machine_is_running())
															
 
																+	{
															
 
																+		_starpu_may_pause();
															
 
																 		_starpu_cuda_driver_run_once(worker);
															
 
																+	}
															
 
																 	_STARPU_TRACE_END_PROGRESS(memnode);
															
 
																 	_starpu_cuda_driver_deinit(worker);
															
--- a/src/drivers/gordon/driver_gordon.c
+++ b/src/drivers/gordon/driver_gordon.c
@@ -343,6 +343,7 @@ void *gordon_worker_inject(struct _starpu_worker_set *arg)
 
																 	while(_starpu_machine_is_running())
															
 
																 	{
															
 
																+		_starpu_may_pause();
															
 
																 		if (gordon_busy_enough())
															
 
																 		{
															
 
																 			/* gordon already has enough work, wait a little TODO */
															
--- a/src/drivers/mp_common/source_common.c
+++ b/src/drivers/mp_common/source_common.c
@@ -683,6 +683,8 @@ void _starpu_src_common_worker(struct _starpu_worker_set * worker_set,
 
																 		int res;
															
 
																 		struct _starpu_job * j;
															
 
																+		_starpu_may_pause();
															
 
																+
															
 
																 		_STARPU_TRACE_START_PROGRESS(memnode);
															
 
																 		_starpu_datawizard_progress(memnode, 1);
															
 
																 		_STARPU_TRACE_END_PROGRESS(memnode);
															
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -214,16 +214,15 @@ cl_int _starpu_opencl_deinit_context(int devid)
 
																 }
															
 
																 #endif
															
 
																-cl_int starpu_opencl_allocate_memory(cl_mem *mem STARPU_ATTRIBUTE_UNUSED, size_t size STARPU_ATTRIBUTE_UNUSED, cl_mem_flags flags STARPU_ATTRIBUTE_UNUSED)
															
 
																+cl_int starpu_opencl_allocate_memory(int devid, cl_mem *mem STARPU_ATTRIBUTE_UNUSED, size_t size STARPU_ATTRIBUTE_UNUSED, cl_mem_flags flags STARPU_ATTRIBUTE_UNUSED)
															
 
																 {
															
 
																 #ifdef STARPU_SIMGRID
															
 
																 	STARPU_ABORT();
															
 
																 #else
															
 
																 	cl_int err;
															
 
																         cl_mem memory;
															
 
																-        struct _starpu_worker *worker = _starpu_get_local_worker_key();
															
 
																-	memory = clCreateBuffer(contexts[worker->devid], flags, size, NULL, &err);
															
 
																+	memory = clCreateBuffer(contexts[devid], flags, size, NULL, &err);
															
 
																 	if (err == CL_OUT_OF_HOST_MEMORY) return err;
															
 
																         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
@@ -234,7 +233,7 @@ cl_int starpu_opencl_allocate_memory(cl_mem *mem STARPU_ATTRIBUTE_UNUSED, size_t
 
																 	 */
															
 
																 	char dummy = 0;
															
 
																 	cl_event ev;
															
 
																-	err = clEnqueueWriteBuffer(alloc_queues[worker->devid], memory, CL_TRUE,
															
 
																+	err = clEnqueueWriteBuffer(alloc_queues[devid], memory, CL_TRUE,
															
 
																 				   0, sizeof(dummy), &dummy,
															
 
																 				   0, NULL, &ev);
															
 
																 	if (err == CL_MEM_OBJECT_ALLOCATION_FAILURE)
															
@@ -597,6 +596,11 @@ int _starpu_opencl_driver_init(struct _starpu_worker *worker)
 
																 	snprintf(worker->short_name, sizeof(worker->short_name), "OpenCL %u", devid);
															
 
																 	worker->pipeline_length = starpu_get_env_number_default("STARPU_OPENCL_PIPELINE", 2);
															
 
																+	if (worker->pipeline_length > STARPU_MAX_PIPELINE)
															
 
																+	{
															
 
																+		_STARPU_DISP("Warning: STARPU_OPENCL_PIPELINE is %u, but STARPU_MAX_PIPELINE is only %u", worker->pipeline_length, STARPU_MAX_PIPELINE);
															
 
																+		worker->pipeline_length = STARPU_MAX_PIPELINE;
															
 
																+	}
															
 
																 	_STARPU_DEBUG("OpenCL (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, worker->bindid);
															
@@ -741,7 +745,10 @@ void *_starpu_opencl_worker(void *_arg)
 
																 	_starpu_opencl_driver_init(worker);
															
 
																 	_STARPU_TRACE_START_PROGRESS(memnode);
															
 
																 	while (_starpu_machine_is_running())
															
 
																+	{
															
 
																+		_starpu_may_pause();
															
 
																 		_starpu_opencl_driver_run_once(worker);
															
 
																+	}
															
 
																 	_starpu_opencl_driver_deinit(worker);
															
 
																 	_STARPU_TRACE_END_PROGRESS(memnode);
															
--- a/src/sched_policies/eager_central_policy.c
+++ b/src/sched_policies/eager_central_policy.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010-2013  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010-2014  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
															
 
																  * Copyright (C) 2011  INRIA
															
 
																  *
															
@@ -91,6 +91,9 @@ static int push_task_eager_policy(struct starpu_task *task)
 
																 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
															
 
																 	struct starpu_sched_ctx_iterator it;
															
 
																+#ifndef STARPU_NON_BLOCKING_DRIVERS
															
 
																+	char dowake[STARPU_NMAXWORKERS] = { 0 };
															
 
																+#endif
															
 
																 	if(workers->init_iterator)
															
 
																 		workers->init_iterator(workers, &it);
															
@@ -112,20 +115,35 @@ static int push_task_eager_policy(struct starpu_task *task)
 
																 #ifdef STARPU_NON_BLOCKING_DRIVERS
															
 
																 				starpu_bitmap_unset(data->waiters, worker);
															
 
																 				/* We really woke at least somebody, no need to wake somebody else */
															
 
																-				goto out;
															
 
																+				break;
															
 
																 #else
															
 
																-				starpu_pthread_mutex_t *sched_mutex;
															
 
																-				starpu_pthread_cond_t *sched_cond;
															
 
																-				starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
															
 
																-
															
 
																-				if (starpu_wakeup_worker(worker, sched_cond, sched_mutex))
															
 
																-				    goto out; // wake up a single worker
															
 
																+				dowake[worker] = 1;
															
 
																 #endif
															
 
																 			}
															
 
																 	}
															
 
																-out:
															
 
																+	/* Let the task free */
															
 
																 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
															
 
																+#ifndef STARPU_NON_BLOCKING_DRIVERS
															
 
																+	/* Now that we have a list of potential workers, try to wake one */
															
 
																+	if(workers->init_iterator)
															
 
																+		workers->init_iterator(workers, &it);
															
 
																+	
															
 
																+	while(workers->has_next(workers, &it))
															
 
																+	{
															
 
																+		worker = workers->get_next(workers, &it);
															
 
																+		if (dowake[worker])
															
 
																+		{
															
 
																+			starpu_pthread_mutex_t *sched_mutex;
															
 
																+			starpu_pthread_cond_t *sched_cond;
															
 
																+			starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
															
 
																+
															
 
																+			if (starpu_wakeup_worker(worker, sched_cond, sched_mutex))
															
 
																+				break; // wake up a single worker
															
 
																+		}
															
 
																+	}
															
 
																+#endif
															
 
																+
															
 
																 	return 0;
															
 
																 }
															
@@ -154,9 +172,11 @@ static struct starpu_task *pop_task_eager_policy(unsigned sched_ctx_id)
 
																 	if (_starpu_fifo_empty(data->fifo))
															
 
																 		return NULL;
															
 
																+#ifdef STARPU_NON_BLOCKING_DRIVERS
															
 
																 	if (starpu_bitmap_get(data->waiters, workerid))
															
 
																 		/* Nobody woke us, avoid bothering the mutex */
															
 
																 		return NULL;
															
 
																+#endif
															
 
																 	STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
															
--- a/src/sched_policies/eager_central_priority_policy.c
+++ b/src/sched_policies/eager_central_priority_policy.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010-2013  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010-2014  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  * Copyright (C) 2011  INRIA
															
 
																  *
															
@@ -139,6 +139,9 @@ static int _starpu_priority_push_task(struct starpu_task *task)
 
																 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
															
 
																 	struct starpu_sched_ctx_iterator it;
															
 
																+#ifndef STARPU_NON_BLOCKING_DRIVERS
															
 
																+	char dowake[STARPU_NMAXWORKERS] = { 0 };
															
 
																+#endif
															
 
																 	if(workers->init_iterator)
															
 
																 		workers->init_iterator(workers, &it);
															
@@ -160,20 +163,35 @@ static int _starpu_priority_push_task(struct starpu_task *task)
 
																 #ifdef STARPU_NON_BLOCKING_DRIVERS
															
 
																 				starpu_bitmap_unset(data->waiters, worker);
															
 
																 				/* We really woke at least somebody, no need to wake somebody else */
															
 
																-				goto out;
															
 
																+				break;
															
 
																 #else
															
 
																-				starpu_pthread_mutex_t *sched_mutex;
															
 
																-				starpu_pthread_cond_t *sched_cond;
															
 
																-				starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
															
 
																-
															
 
																-				if (starpu_wakeup_worker(worker, sched_cond, sched_mutex))
															
 
																-				    goto out; // wake up a single worker
															
 
																+				dowake[worker] = 1;
															
 
																 #endif
															
 
																 			}
															
 
																 	}
															
 
																-out:
															
 
																+	/* Let the task free */
															
 
																 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
															
 
																+#ifndef STARPU_NON_BLOCKING_DRIVERS
															
 
																+	/* Now that we have a list of potential workers, try to wake one */
															
 
																+	if(workers->init_iterator)
															
 
																+		workers->init_iterator(workers, &it);
															
 
																+	
															
 
																+	while(workers->has_next(workers, &it))
															
 
																+	{
															
 
																+		worker = workers->get_next(workers, &it);
															
 
																+		if (dowake[worker])
															
 
																+		{
															
 
																+			starpu_pthread_mutex_t *sched_mutex;
															
 
																+			starpu_pthread_cond_t *sched_cond;
															
 
																+			starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
															
 
																+
															
 
																+			if (starpu_wakeup_worker(worker, sched_cond, sched_mutex))
															
 
																+				break; // wake up a single worker
															
 
																+		}
															
 
																+	}
															
 
																+#endif
															
 
																+
															
 
																 	return 0;
															
 
																 }
															
@@ -194,9 +212,11 @@ static struct starpu_task *_starpu_priority_pop_task(unsigned sched_ctx_id)
 
																 	if (taskq->total_ntasks == 0)
															
 
																 		return NULL;
															
 
																+#ifdef STARPU_NON_BLOCKING_DRIVERS
															
 
																 	if (starpu_bitmap_get(data->waiters, workerid))
															
 
																 		/* Nobody woke us, avoid bothering the mutex */
															
 
																 		return NULL;
															
 
																+#endif
															
 
																 	/* release this mutex before trying to wake up other workers */
															
 
																 	starpu_pthread_mutex_t *curr_sched_mutex;
															
--- a/src/sched_policies/locality_work_stealing_policy.c
+++ b/src/sched_policies/locality_work_stealing_policy.c
@@ -224,13 +224,12 @@ static int lws_push_task(struct starpu_task *task)
 
																 #ifndef STARPU_NON_BLOCKING_DRIVERS
															
 
																 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
															
 
																 	struct starpu_sched_ctx_iterator it;
															
 
																+	unsigned worker;
															
 
																 	if(workers->init_iterator)
															
 
																 		workers->init_iterator(workers, &it);
															
 
																 	while(workers->has_next(workers, &it))
															
 
																 	{
															
 
																 		worker = workers->get_next(workers, &it);
															
 
																-		starpu_pthread_mutex_t *sched_mutex;
															
 
																-		starpu_pthread_cond_t *sched_cond;
															
 
																 		starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
															
 
																 		STARPU_PTHREAD_COND_SIGNAL(sched_cond);
															
 
																 	}
															
@@ -368,6 +367,6 @@ struct starpu_sched_policy _starpu_sched_lws_policy =
 
																 	.pre_exec_hook = NULL,
															
 
																 	.post_exec_hook = NULL,
															
 
																 	.pop_every_task = NULL,
															
 
																-	.policy_name = "nws",
															
 
																-	.policy_description = "new work stealing"
															
 
																+	.policy_name = "lws",
															
 
																+	.policy_description = "locality work stealing"
															
 
																 };
															
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -151,6 +151,7 @@ noinst_PROGRAMS =				\
 
																 	datawizard/acquire_cb_insert		\
															
 
																 	datawizard/acquire_release		\
															
 
																 	datawizard/acquire_release2		\
															
 
																+	datawizard/cache			\
															
 
																 	datawizard/commute			\
															
 
																 	datawizard/copy				\
															
 
																 	datawizard/data_implicit_deps		\
															
--- a/tests/datawizard/cache.c
+++ b/tests/datawizard/cache.c
@@ -0,0 +1,100 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010, 2014  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+#include "../helper.h"
															
 
																+
															
 
																+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
															
 
																+static void codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
															
 
																+{
															
 
																+     FPRINTF(stderr, "%lx\n", (unsigned long) STARPU_VARIABLE_GET_PTR(descr[0]));
															
 
																+     FPRINTF(stderr, "codelet\n");
															
 
																+}
															
 
																+#endif
															
 
																+
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+static struct starpu_codelet cuda_cl =
															
 
																+{
															
 
																+     .cuda_funcs = {codelet, NULL},
															
 
																+     .nbuffers = 1,
															
 
																+     .modes = {STARPU_R}
															
 
																+};
															
 
																+#endif
															
 
																+
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+static struct starpu_codelet opencl_cl =
															
 
																+{
															
 
																+     .opencl_funcs = {codelet, NULL},
															
 
																+     .nbuffers = 1,
															
 
																+     .modes = {STARPU_R}
															
 
																+};
															
 
																+#endif
															
 
																+
															
 
																+void dotest(struct starpu_codelet *cl)
															
 
																+{
															
 
																+     int ret;
															
 
																+     int var = 42;
															
 
																+     starpu_data_handle_t handle;
															
 
																+
															
 
																+     starpu_variable_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)&var, sizeof(var));
															
 
																+
															
 
																+     ret = starpu_task_insert(cl, STARPU_R, handle, 0);
															
 
																+     if (ret == -ENODEV) goto enodev;
															
 
																+     STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
															
 
																+
															
 
																+     starpu_task_wait_for_all();
															
 
																+
															
 
																+     starpu_data_unregister(handle);
															
 
																+
															
 
																+     starpu_variable_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)&var, sizeof(var));
															
 
																+
															
 
																+     ret = starpu_task_insert(cl, STARPU_R, handle, 0);
															
 
																+     if (ret == -ENODEV) goto enodev;
															
 
																+     STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
															
 
																+
															
 
																+     starpu_task_wait_for_all();
															
 
																+
															
 
																+enodev:
															
 
																+     starpu_data_unregister(handle);
															
 
																+}
															
 
																+
															
 
																+int main(int argc, char **argv)
															
 
																+{
															
 
																+     int ret;
															
 
																+
															
 
																+     ret = starpu_init(NULL);
															
 
																+     if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
															
 
																+     STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
															
 
																+
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+     dotest(&cuda_cl);
															
 
																+#endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+     dotest(&opencl_cl);
															
 
																+#endif
															
 
																+
															
 
																+     starpu_shutdown();
															
 
																+
															
 
																+     return 0;
															
 
																+
															
 
																+enodev:
															
 
																+     starpu_shutdown();
															
 
																+     /* yes, we do not perform the computation but we did detect that no one
															
 
																+      * could perform the kernel, so this is not an error from StarPU */
															
 
																+     fprintf(stderr, "WARNING: No one can execute this task\n");
															
 
																+     return STARPU_TEST_SKIPPED;
															
 
																+}
															
--- a/tests/datawizard/gpu_ptr_register.c
+++ b/tests/datawizard/gpu_ptr_register.c
@@ -55,11 +55,13 @@ submit_tasks(starpu_data_handle_t handle, int pieces, int n)
 
																 static int
															
 
																 find_a_worker(enum starpu_worker_archtype type)
															
 
																 {
															
 
																-	int worker;
															
 
																-	int ret = starpu_worker_get_ids_by_type(type, &worker, 1);
															
 
																+	int worker[STARPU_NMAXWORKERS];
															
 
																+	int ret = starpu_worker_get_ids_by_type(type, worker, STARPU_NMAXWORKERS);
															
 
																 	if (ret == 0)
															
 
																 		return -ENODEV;
															
 
																-	return worker;
															
 
																+	if (ret == -ERANGE)
															
 
																+		return worker[STARPU_NMAXWORKERS-1];
															
 
																+	return worker[ret-1];
															
 
																 }
															
 
																 static int
															
@@ -78,7 +80,7 @@ check_result(unsigned *t, size_t size)
 
																 }
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-#if CUDART_VERSION >= 4000
															
 
																+#ifdef HAVE_CUDA_MEMCPY_PEER
															
 
																 static int
															
 
																 test_cuda(void)
															
 
																 {
															
@@ -100,8 +102,7 @@ test_cuda(void)
 
																 	size = 10 * n;
															
 
																 	devid = starpu_worker_get_devid(chosen);
															
 
																-	starpu_cuda_set_device(devid);
															
 
																-	cudaMalloc((void**)&foo_gpu, size * sizeof(*foo_gpu));
															
 
																+	foo_gpu = (void*) starpu_malloc_on_node(starpu_worker_get_memory_node(chosen), size * sizeof(*foo_gpu));
															
 
																 	foo = calloc(size, sizeof(*foo));
															
 
																 	for (i = 0; i < size; i++)
															
@@ -180,9 +181,7 @@ test_opencl(void)
 
																 	starpu_opencl_get_context(devid, &context);
															
 
																 	starpu_opencl_get_queue(devid, &queue);
															
 
																-	foo_gpu = clCreateBuffer(context, CL_MEM_READ_WRITE, size*sizeof(int), NULL, &err);
															
 
																-	if (STARPU_UNLIKELY(err != CL_SUCCESS))
															
 
																-		STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+	foo_gpu = (void*) starpu_malloc_on_node(starpu_worker_get_memory_node(chosen), size * sizeof(int));
															
 
																 	unsigned int *foo = malloc(size*sizeof(*foo));
															
 
																 	for (i = 0; i < size; i++)
															
@@ -261,7 +260,7 @@ int main(int argc, char **argv)
 
																 #endif
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-#if CUDART_VERSION >= 4000 /* We need thread-safety of CUDA */
															
 
																+#ifdef HAVE_CUDA_MEMCPY_PEER
															
 
																 	ret = test_cuda();
															
 
																 	if (ret == 1)
															
 
																 		goto fail;
															
--- a/tests/datawizard/gpu_register.c
+++ b/tests/datawizard/gpu_register.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2011-2012  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2011-2012, 2014  Université de Bordeaux 1
															
 
																  * Copyright (C) 2012 inria
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -55,11 +55,13 @@ submit_tasks(starpu_data_handle_t handle, int pieces, int n)
 
																 static int
															
 
																 find_a_worker(enum starpu_worker_archtype type)
															
 
																 {
															
 
																-	int worker;
															
 
																-	int ret = starpu_worker_get_ids_by_type(type, &worker, 1);
															
 
																+	int worker[STARPU_NMAXWORKERS];
															
 
																+	int ret = starpu_worker_get_ids_by_type(type, worker, STARPU_NMAXWORKERS);
															
 
																 	if (ret == 0)
															
 
																 		return -ENODEV;
															
 
																-	return worker;
															
 
																+	if (ret == -ERANGE)
															
 
																+		return worker[STARPU_NMAXWORKERS-1];
															
 
																+	return worker[ret-1];
															
 
																 }
															
 
																 static int
															
@@ -78,7 +80,7 @@ check_result(unsigned *t, size_t size)
 
																 }
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-#if CUDART_VERSION >= 4000
															
 
																+#ifdef HAVE_CUDA_MEMCPY_PEER
															
 
																 static int
															
 
																 test_cuda(void)
															
 
																 {
															
@@ -100,8 +102,7 @@ test_cuda(void)
 
																 	size = 10 * n;
															
 
																 	devid = starpu_worker_get_devid(chosen);
															
 
																-	starpu_cuda_set_device(devid);
															
 
																-	cudaMalloc((void**)&foo_gpu, size * sizeof(*foo_gpu));
															
 
																+	foo_gpu = (void*) starpu_malloc_on_node(starpu_worker_get_memory_node(chosen), size * sizeof(*foo_gpu));
															
 
																 	foo = calloc(size, sizeof(*foo));
															
 
																 	for (i = 0; i < size; i++)
															
@@ -182,9 +183,7 @@ test_opencl(void)
 
																 	starpu_opencl_get_context(devid, &context);
															
 
																 	starpu_opencl_get_queue(devid, &queue);
															
 
																-	foo_gpu = clCreateBuffer(context, CL_MEM_READ_WRITE, size*sizeof(int), NULL, &err);
															
 
																-	if (STARPU_UNLIKELY(err != CL_SUCCESS))
															
 
																-		STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+	foo_gpu = (void*) starpu_malloc_on_node(starpu_worker_get_memory_node(chosen), size * sizeof(int));
															
 
																 	unsigned int *foo = malloc(size*sizeof(*foo));
															
 
																 	for (i = 0; i < size; i++)
															
@@ -269,7 +268,7 @@ int main(int argc, char **argv)
 
																 #endif
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-#if CUDART_VERSION >= 4000 /* We need thread-safety of CUDA */
															
 
																+#ifdef HAVE_CUDA_MEMCPY_PEER
															
 
																 	ret = test_cuda();
															
 
																 	if (ret == 1)
															
 
																 		goto fail;
															
--- a/tests/sched_policies/simple_cpu_gpu_sched.c
+++ b/tests/sched_policies/simple_cpu_gpu_sched.c
@@ -205,7 +205,18 @@ run(struct starpu_sched_policy *policy)
 
																 	if (cpu_task_worker != STARPU_CPU_WORKER ||
															
 
																 			(gpu_task_worker != STARPU_CUDA_WORKER &&
															
 
																 			 gpu_task_worker != STARPU_OPENCL_WORKER))
															
 
																+	{
															
 
																+		if (cpu_task_worker != STARPU_CPU_WORKER)
															
 
																+		{
															
 
																+			FPRINTF(stderr, "The CPU task did not run on a CPU worker\n");
															
 
																+		}
															
 
																+		if (gpu_task_worker != STARPU_CUDA_WORKER && gpu_task_worker != STARPU_OPENCL_WORKER)
															
 
																+		{
															
 
																+			FPRINTF(stderr, "The GPU task did not run on a Cuda or OpenCL worker\n");
															
 
																+		}
															
 
																+
															
 
																 		ret = 1;
															
 
																+	}
															
 
																 	else
															
 
																 		ret = 0;