11 年之前 · 8a1120b2d1
--- a/Makefile.am
+++ b/Makefile.am
@@ -85,7 +85,8 @@ versinclude_HEADERS = 				\
 
				 	include/starpu_stdlib.h			\
			
 
				 	include/starpu_thread.h			\
			
 
				 	include/starpu_thread_util.h		\
			
 
				-	include/starpu_tree.h
			
 
				+	include/starpu_tree.h			\
			
 
				+	include/starpu_simgrid_wrap.h
			
 
				 
			
 
				 nodist_versinclude_HEADERS = 			\
			
 
				 	include/starpu_config.h
			
--- a/configure.ac
+++ b/configure.ac
@@ -1820,6 +1820,8 @@ AC_SUBST(USE_MPI, $use_mpi)
 
				 AM_CONDITIONAL(USE_MPI, test x$use_mpi = xyes)
			
 
				 if test x$use_mpi = xyes; then
			
 
				 	AC_DEFINE(STARPU_USE_MPI,[],[whether the StarPU MPI library is available])
			
 
				+else
			
 
				+	running_mpi_check=no
			
 
				 fi
			
 
				 
			
 
				 AC_ARG_ENABLE(mpi-progression-hook, [AS_HELP_STRING([--enable-mpi-progression-hook],
			
--- a/doc/doxygen/chapters/13offline_performance_tools.doxy
+++ b/doc/doxygen/chapters/13offline_performance_tools.doxy
@@ -119,8 +119,9 @@ $ vite paje.trace
 
				 To get names of tasks instead of "unknown", fill the optional
			
 
				 starpu_codelet::name, or use a performance model for them.
			
 
				 Details of the codelet execution can be obtained by passing
			
 
				-<c>--enable-paje-codelet-details</c> and using a recent enough version of ViTE
			
 
				-(at least r1430).
			
 
				+\ref enable-paje-codelet-details "--enable-paje-codelet-details" when
			
 
				+configuring StarPU and using a recent enough version of ViTE (at least
			
 
				+r1430).
			
 
				 
			
 
				 In the MPI execution case, collect the trace files from the MPI nodes, and
			
 
				 specify them all on the command <c>starpu_fxt_tool</c>, for instance:
			
@@ -133,9 +134,10 @@ By default, all tasks are displayed using a green color. To display tasks with
 
				 varying colors, pass option <c>-c</c> to <c>starpu_fxt_tool</c>.
			
 
				 
			
 
				 To identify tasks precisely, the application can set the ::tag_id field of the
			
 
				-tasks (or use STARPU_TAG_ONY when using starpu_task_insert), and with a recent
			
 
				-enough version of vite (>= r1430) and the <c>--enable-paje-codelet-details</c>
			
 
				-configure option, the value of the tag will show up in the trace.
			
 
				+tasks (or use STARPU_TAG_ONLY when using starpu_task_insert()), and with a recent
			
 
				+enough version of vite (>= r1430) and the
			
 
				+\ref enable-paje-codelet-details "--enable-paje-codelet-details"
			
 
				+StarPU configure option, the value of the tag will show up in the trace.
			
 
				 
			
 
				 Traces can also be inspected by hand by using the tool <c>fxt_print</c>, for instance:
			
 
				 
			
--- a/doc/doxygen/chapters/21simgrid.doxy
+++ b/doc/doxygen/chapters/21simgrid.doxy
@@ -22,9 +22,9 @@ get the simulated time, it has to use starpu_timing_now() which returns the
 
				 virtual timestamp in us.
			
 
				 
			
 
				 For some technical reason, the application's .c file which contains main() has
			
 
				-to be recompiled with starpu.h, which in the simgrid case will # define main()
			
 
				+to be recompiled with starpu_simgrid_wrap.h, which in the simgrid case will # define main()
			
 
				 into starpu_main(), and it is libstarpu which will provide the real main() and
			
 
				-call the application's main().
			
 
				+will call the application's main().
			
 
				 
			
 
				 To be able to test with crazy data sizes, one may want to only allocate
			
 
				 application data if STARPU_SIMGRID is not defined.  Passing a NULL pointer to
			
--- a/doc/doxygen/chapters/api/data_interfaces.doxy
+++ b/doc/doxygen/chapters/api/data_interfaces.doxy
@@ -57,8 +57,9 @@ case of e.g. available particular CUDA or OpenCL support.
 
				 \ingroup API_Data_Interfaces
			
 
				 \var starpu_data_copy_methods::can_copy
			
 
				 If defined, allows the interface to declare whether it supports transferring
			
 
				-from \p src_interface on node \p src_node to \p dst_interface on node \p. If not
			
 
				-defined, it is assumed that the interface supports all transfers.
			
 
				+from \p src_interface on node \p src_node to \p dst_interface on node \p
			
 
				+dst_node, run from node \p handling_node. If not defined, it is assumed that the
			
 
				+interface supports all transfers.
			
 
				 \var starpu_data_copy_methods::ram_to_ram
			
 
				 Define how to copy data from the \p src_interface interface on the \p
			
 
				 src_node CPU node to the \p dst_interface interface on the \p dst_node
			
@@ -1000,11 +1001,12 @@ DefiningANewDataInterface.
 
				 \ingroup API_Data_Interfaces
			
 
				 Allocate \p size bytes on node \p dst_node. This returns 0 if
			
 
				 allocation failed, the allocation method should then return <c>-ENOMEM</c> as
			
 
				-allocated size.
			
 
				+allocated size. Deallocation must be done with starpu_free_on_node.
			
 
				 
			
 
				 \fn void starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
			
 
				 \ingroup API_Data_Interfaces
			
 
				-Free \p addr of \p size bytes on node \p dst_node.
			
 
				+Free \p addr of \p size bytes on node \p dst_node which was previously allocated
			
 
				+with starpu_malloc_on_node.
			
 
				 
			
 
				 \fn int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, void *async_data)
			
 
				 \ingroup API_Data_Interfaces
			
--- a/examples/basic_examples/vector_scal_c.c
+++ b/examples/basic_examples/vector_scal_c.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				- * Copyright (C) 2011, 2013  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2011, 2013-2014  Université de Bordeaux 1
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -47,6 +47,7 @@ static struct starpu_codelet cl =
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	/* CUDA implementation of the codelet */
			
 
				 	.cuda_funcs = {scal_cuda_func, NULL},
			
 
				+	.cuda_flags = {STARPU_CUDA_ASYNC},
			
 
				 #endif
			
 
				 	.nbuffers = 1,
			
 
				 	.model = &vector_scal_model
			
--- a/examples/filters/custom_mf/custom_interface.c
+++ b/examples/filters/custom_mf/custom_interface.c
@@ -425,7 +425,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
 
				 	size = src_custom->nx * 2 * sizeof(float);
			
 
				 	if (dst_custom->cpu_ptr == NULL)
			
 
				 	{
			
 
				-		ret = starpu_opencl_allocate_memory((cl_mem*)&dst_custom->cpu_ptr,
			
 
				+		ret = starpu_opencl_allocate_memory(devid, (cl_mem*)&dst_custom->cpu_ptr,
			
 
				 				size, CL_MEM_READ_WRITE);
			
 
				 		assert(ret == CL_SUCCESS);
			
 
				 	}
			
--- a/examples/lu/lu_example.c
+++ b/examples/lu/lu_example.c
@@ -58,6 +58,7 @@ static void parse_args(int argc, char **argv)
 
				 			nblocks = strtol(argv[++i], &argptr, 10);
			
 
				 		}
			
 
				 
			
 
				+#ifndef STARPU_SIMGRID
			
 
				 		else if (strcmp(argv[i], "-check") == 0)
			
 
				 		{
			
 
				 			check = 1;
			
@@ -72,6 +73,7 @@ static void parse_args(int argc, char **argv)
 
				 		{
			
 
				 			no_stride = 1;
			
 
				 		}
			
 
				+#endif
			
 
				 
			
 
				 		else if (strcmp(argv[i], "-profile") == 0)
			
 
				 		{
			
@@ -315,6 +317,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_cublas_init();
			
 
				 
			
 
				+#ifndef STARPU_SIMGRID
			
 
				 	init_matrix();
			
 
				 
			
 
				 	unsigned *ipiv = NULL;
			
@@ -364,6 +367,7 @@ int main(int argc, char **argv)
 
				 		}
			
 
				 	}
			
 
				 	else
			
 
				+#endif
			
 
				 	{
			
 
				 		ret = STARPU_LU(lu_decomposition)(A, size, size, nblocks);
			
 
				 	}
			
@@ -403,6 +407,7 @@ int main(int argc, char **argv)
 
				 		}
			
 
				 	}
			
 
				 
			
 
				+#ifndef STARPU_SIMGRID
			
 
				 	if (check)
			
 
				 	{
			
 
				 		FPRINTF(stderr, "Checking result\n");
			
@@ -413,6 +418,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 		check_result();
			
 
				 	}
			
 
				+#endif
			
 
				 
			
 
				 	starpu_free(A);
			
 
				 
			
--- a/include/starpu.h
+++ b/include/starpu.h
@@ -66,18 +66,13 @@ typedef UINT_PTR uintptr_t;
 
				 #include <starpu_fxt.h>
			
 
				 #include <starpu_driver.h>
			
 
				 #include <starpu_tree.h>
			
 
				+#include <starpu_simgrid_wrap.h>
			
 
				 
			
 
				 #ifdef __cplusplus
			
 
				 extern "C"
			
 
				 {
			
 
				 #endif
			
 
				 
			
 
				-#ifdef STARPU_SIMGRID
			
 
				-#ifndef main
			
 
				-#define main starpu_main
			
 
				-#endif
			
 
				-#endif
			
 
				-
			
 
				 struct starpu_conf
			
 
				 {
			
 
				 	int magic;
			
--- a/include/starpu_data_interfaces.h
+++ b/include/starpu_data_interfaces.h
@@ -37,7 +37,7 @@ extern "C"
 
				 
			
 
				 struct starpu_data_copy_methods
			
 
				 {
			
 
				-	int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				+	int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node);
			
 
				 
			
 
				 	int (*ram_to_ram)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				 	int (*ram_to_cuda)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
--- a/include/starpu_opencl.h
+++ b/include/starpu_opencl.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2013  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2014  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -76,7 +76,7 @@ int starpu_opencl_collect_stats(cl_event event);
 
				 
			
 
				 int starpu_opencl_set_kernel_args(cl_int *err, cl_kernel *kernel, ...);
			
 
				 
			
 
				-cl_int starpu_opencl_allocate_memory(cl_mem *addr, size_t size, cl_mem_flags flags);
			
 
				+cl_int starpu_opencl_allocate_memory(int devid, cl_mem *addr, size_t size, cl_mem_flags flags);
			
 
				 
			
 
				 cl_int starpu_opencl_copy_ram_to_opencl(void *ptr, unsigned src_node, cl_mem buffer, unsigned dst_node, size_t size, size_t offset, cl_event *event, int *ret);
			
 
				 
			
--- a/include/starpu_simgrid_wrap.h
+++ b/include/starpu_simgrid_wrap.h
@@ -0,0 +1,28 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2014  Université de Bordeaux 1
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __STARPU_SIMGRID_WRAP_H__
			
 
				+#define __STARPU_SIMGRID_WRAP_H__
			
 
				+
			
 
				+#include <starpu_config.h>
			
 
				+
			
 
				+#ifdef STARPU_SIMGRID
			
 
				+#ifndef main
			
 
				+#define main starpu_main
			
 
				+#endif
			
 
				+#endif
			
 
				+
			
 
				+#endif /* __STARPU_SIMGRID_WRAP_H__ */
			
--- a/mpi/src/starpu_mpi.c
+++ b/mpi/src/starpu_mpi.c
@@ -173,12 +173,12 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
 
				 
			
 
				 	 _starpu_mpi_comm_amounts_inc(req->comm, req->srcdst, req->datatype, req->count);
			
 
				 
			
 
				-	 TRACE_MPI_ISEND_SUBMIT_BEGIN(req->srcdst, req->mpi_tag, 0);
			
 
				+	 _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(req->srcdst, req->mpi_tag, 0);
			
 
				 
			
 
				 	 req->ret = MPI_Isend(req->ptr, req->count, req->datatype, req->srcdst, _starpu_mpi_tag, req->comm, &req->request);
			
 
				 	 STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Isend returning %d", req->ret);
			
 
				 
			
 
				-	 TRACE_MPI_ISEND_SUBMIT_END(req->srcdst, req->mpi_tag, 0);
			
 
				+	 _STARPU_MPI_TRACE_ISEND_SUBMIT_END(req->srcdst, req->mpi_tag, 0);
			
 
				 
			
 
				 	 /* somebody is perhaps waiting for the MPI request to be posted */
			
 
				 	 STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
			
@@ -304,12 +304,12 @@ static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 
				 
			
 
				 	_STARPU_MPI_DEBUG(20, "post MPI irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
			
 
				 
			
 
				-	TRACE_MPI_IRECV_SUBMIT_BEGIN(req->srcdst, req->mpi_tag);
			
 
				+	_STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(req->srcdst, req->mpi_tag);
			
 
				 
			
 
				 	req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->srcdst, _starpu_mpi_tag, req->comm, &req->request);
			
 
				 	STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_IRecv returning %d", req->ret);
			
 
				 
			
 
				-	TRACE_MPI_IRECV_SUBMIT_END(req->srcdst, req->mpi_tag);
			
 
				+	_STARPU_MPI_TRACE_IRECV_SUBMIT_END(req->srcdst, req->mpi_tag);
			
 
				 
			
 
				 	/* somebody is perhaps waiting for the MPI request to be posted */
			
 
				 	STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
			
@@ -413,12 +413,12 @@ static void _starpu_mpi_wait_func(struct _starpu_mpi_req *waiting_req)
 
				 	/* Which is the mpi request we are waiting for ? */
			
 
				 	struct _starpu_mpi_req *req = waiting_req->other_request;
			
 
				 
			
 
				-	TRACE_MPI_UWAIT_BEGIN(req->srcdst, req->mpi_tag);
			
 
				+	_STARPU_MPI_TRACE_UWAIT_BEGIN(req->srcdst, req->mpi_tag);
			
 
				 
			
 
				 	req->ret = MPI_Wait(&req->request, waiting_req->status);
			
 
				 	STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Wait returning %d", req->ret);
			
 
				 
			
 
				-	TRACE_MPI_UWAIT_END(req->srcdst, req->mpi_tag);
			
 
				+	_STARPU_MPI_TRACE_UWAIT_END(req->srcdst, req->mpi_tag);
			
 
				 
			
 
				 	_starpu_mpi_handle_request_termination(req);
			
 
				 	_STARPU_MPI_LOG_OUT();
			
@@ -481,12 +481,12 @@ static void _starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
 
				 	_STARPU_MPI_DEBUG(2, "Test request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n",
			
 
				 			  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
			
 
				 
			
 
				-	TRACE_MPI_UTESTING_BEGIN(req->srcdst, req->mpi_tag);
			
 
				+	_STARPU_MPI_TRACE_UTESTING_BEGIN(req->srcdst, req->mpi_tag);
			
 
				 
			
 
				 	req->ret = MPI_Test(&req->request, testing_req->flag, testing_req->status);
			
 
				 	STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Test returning %d", req->ret);
			
 
				 
			
 
				-	TRACE_MPI_UTESTING_END(req->srcdst, req->mpi_tag);
			
 
				+	_STARPU_MPI_TRACE_UTESTING_END(req->srcdst, req->mpi_tag);
			
 
				 
			
 
				 	if (*testing_req->flag)
			
 
				 	{
			
@@ -933,22 +933,22 @@ static void _starpu_mpi_test_detached_requests(void)
 
				 		{
			
 
				 			if (req->request_type == RECV_REQ)
			
 
				 			{
			
 
				-				TRACE_MPI_IRECV_COMPLETE_BEGIN(req->srcdst, req->mpi_tag);
			
 
				+				_STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(req->srcdst, req->mpi_tag);
			
 
				 			}
			
 
				 			else if (req->request_type == SEND_REQ)
			
 
				 			{
			
 
				-				TRACE_MPI_ISEND_COMPLETE_BEGIN(req->srcdst, req->mpi_tag, 0);
			
 
				+				_STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(req->srcdst, req->mpi_tag, 0);
			
 
				 			}
			
 
				 
			
 
				 			_starpu_mpi_handle_request_termination(req);
			
 
				 
			
 
				 			if (req->request_type == RECV_REQ)
			
 
				 			{
			
 
				-				TRACE_MPI_IRECV_COMPLETE_END(req->srcdst, req->mpi_tag);
			
 
				+				_STARPU_MPI_TRACE_IRECV_COMPLETE_END(req->srcdst, req->mpi_tag);
			
 
				 			}
			
 
				 			else if (req->request_type == SEND_REQ)
			
 
				 			{
			
 
				-				TRACE_MPI_ISEND_COMPLETE_END(req->srcdst, req->mpi_tag, 0);
			
 
				+				_STARPU_MPI_TRACE_ISEND_COMPLETE_END(req->srcdst, req->mpi_tag, 0);
			
 
				 			}
			
 
				 		}
			
 
				 
			
@@ -1057,7 +1057,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 	MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
			
 
				 
			
 
				 	{
			
 
				-		TRACE_MPI_START(rank, worldsize);
			
 
				+		_STARPU_MPI_TRACE_START(rank, worldsize);
			
 
				 #ifdef STARPU_USE_FXT
			
 
				 		starpu_profiling_set_id(rank);
			
 
				 #endif //STARPU_USE_FXT
			
@@ -1097,14 +1097,14 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 		{
			
 
				 			_STARPU_MPI_DEBUG(3, "NO MORE REQUESTS TO HANDLE\n");
			
 
				 
			
 
				-			TRACE_MPI_SLEEP_BEGIN();
			
 
				+			_STARPU_MPI_TRACE_SLEEP_BEGIN();
			
 
				 
			
 
				 			if (barrier_running)
			
 
				 				/* Tell mpi_barrier */
			
 
				 				STARPU_PTHREAD_COND_SIGNAL(&cond_finished);
			
 
				 			STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
			
 
				 
			
 
				-			TRACE_MPI_SLEEP_END();
			
 
				+			_STARPU_MPI_TRACE_SLEEP_END();
			
 
				 		}
			
 
				 
			
 
				 		/* get one request */
			
@@ -1323,7 +1323,7 @@ static void _starpu_mpi_add_sync_point_in_fxt(void)
 
				 	ret = MPI_Bcast(&random_number, 1, MPI_INT, 0, MPI_COMM_WORLD);
			
 
				 	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Bcast returning %d", ret);
			
 
				 
			
 
				-	TRACE_MPI_BARRIER(rank, worldsize, random_number);
			
 
				+	_STARPU_MPI_TRACE_BARRIER(rank, worldsize, random_number);
			
 
				 
			
 
				 	_STARPU_MPI_DEBUG(3, "unique key %x\n", random_number);
			
 
				 #endif
			
@@ -1407,7 +1407,7 @@ int starpu_mpi_shutdown(void)
 
				 	starpu_progression_hook_deregister(hookid);
			
 
				 #endif /* STARPU_MPI_ACTIVITY */
			
 
				 
			
 
				-	TRACE_MPI_STOP(rank, world_size);
			
 
				+	_STARPU_MPI_TRACE_STOP(rank, world_size);
			
 
				 
			
 
				 	/* free the request queues */
			
 
				 	_starpu_mpi_req_list_delete(detached_requests);
			
--- a/mpi/src/starpu_mpi_fxt.h
+++ b/mpi/src/starpu_mpi_fxt.h
@@ -26,86 +26,86 @@
 
				 extern "C" {
			
 
				 #endif
			
 
				 
			
 
				-#define FUT_MPI_START				0x5201
			
 
				-#define FUT_MPI_STOP				0x5202
			
 
				-#define FUT_MPI_BARRIER				0x5203
			
 
				-#define FUT_MPI_ISEND_SUBMIT_BEGIN		0x5204
			
 
				-#define FUT_MPI_ISEND_SUBMIT_END		0x5205
			
 
				-#define FUT_MPI_IRECV_SUBMIT_BEGIN		0x5206
			
 
				-#define FUT_MPI_IRECV_SUBMIT_END		0x5207
			
 
				-#define FUT_MPI_ISEND_COMPLETE_BEGIN		0x5208
			
 
				-#define FUT_MPI_ISEND_COMPLETE_END		0x5209
			
 
				-#define FUT_MPI_IRECV_COMPLETE_BEGIN		0x5210
			
 
				-#define FUT_MPI_IRECV_COMPLETE_END		0x5211
			
 
				-#define FUT_MPI_SLEEP_BEGIN			0x5212
			
 
				-#define FUT_MPI_SLEEP_END			0x5213
			
 
				-#define FUT_MPI_DTESTING_BEGIN			0x5214
			
 
				-#define FUT_MPI_DTESTING_END			0x5215
			
 
				-#define FUT_MPI_UTESTING_BEGIN			0x5216
			
 
				-#define FUT_MPI_UTESTING_END			0x5217
			
 
				-#define FUT_MPI_UWAIT_BEGIN			0x5218
			
 
				-#define FUT_MPI_UWAIT_END			0x5219
			
 
				+#define _STARPU_MPI_FUT_START				0x5201
			
 
				+#define _STARPU_MPI_FUT_STOP				0x5202
			
 
				+#define _STARPU_MPI_FUT_BARRIER				0x5203
			
 
				+#define _STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN		0x5204
			
 
				+#define _STARPU_MPI_FUT_ISEND_SUBMIT_END		0x5205
			
 
				+#define _STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN		0x5206
			
 
				+#define _STARPU_MPI_FUT_IRECV_SUBMIT_END		0x5207
			
 
				+#define _STARPU_MPI_FUT_ISEND_COMPLETE_BEGIN		0x5208
			
 
				+#define _STARPU_MPI_FUT_ISEND_COMPLETE_END		0x5209
			
 
				+#define _STARPU_MPI_FUT_IRECV_COMPLETE_BEGIN		0x5210
			
 
				+#define _STARPU_MPI_FUT_IRECV_COMPLETE_END		0x5211
			
 
				+#define _STARPU_MPI_FUT_SLEEP_BEGIN			0x5212
			
 
				+#define _STARPU_MPI_FUT_SLEEP_END			0x5213
			
 
				+#define _STARPU_MPI_FUT_DTESTING_BEGIN			0x5214
			
 
				+#define _STARPU_MPI_FUT_DTESTING_END			0x5215
			
 
				+#define _STARPU_MPI_FUT_UTESTING_BEGIN			0x5216
			
 
				+#define _STARPU_MPI_FUT_UTESTING_END			0x5217
			
 
				+#define _STARPU_MPI_FUT_UWAIT_BEGIN			0x5218
			
 
				+#define _STARPU_MPI_FUT_UWAIT_END			0x5219
			
 
				 
			
 
				 #ifdef STARPU_USE_FXT
			
 
				-#define TRACE_MPI_START(rank, worldsize)	\
			
 
				-	FUT_DO_PROBE3(FUT_MPI_START, (rank), (worldsize), _starpu_gettid());
			
 
				-#define TRACE_MPI_STOP(rank, worldsize)	\
			
 
				-	FUT_DO_PROBE3(FUT_MPI_STOP, (rank), (worldsize), _starpu_gettid());
			
 
				-#define TRACE_MPI_BARRIER(rank, worldsize, key)	\
			
 
				-	FUT_DO_PROBE4(FUT_MPI_BARRIER, (rank), (worldsize), (key), _starpu_gettid());
			
 
				-#define TRACE_MPI_ISEND_SUBMIT_BEGIN(dest, mpi_tag, size)	\
			
 
				-	FUT_DO_PROBE4(FUT_MPI_ISEND_SUBMIT_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
			
 
				-#define TRACE_MPI_ISEND_SUBMIT_END(dest, mpi_tag, size)	\
			
 
				-	FUT_DO_PROBE4(FUT_MPI_ISEND_SUBMIT_END, (dest), (mpi_tag), (size), _starpu_gettid());
			
 
				-#define TRACE_MPI_IRECV_SUBMIT_BEGIN(src, mpi_tag)	\
			
 
				-	FUT_DO_PROBE3(FUT_MPI_IRECV_SUBMIT_BEGIN, (src), (mpi_tag), _starpu_gettid());
			
 
				-#define TRACE_MPI_IRECV_SUBMIT_END(src, mpi_tag)	\
			
 
				-	FUT_DO_PROBE3(FUT_MPI_IRECV_SUBMIT_END, (src), (mpi_tag), _starpu_gettid());
			
 
				-#define TRACE_MPI_ISEND_COMPLETE_BEGIN(dest, mpi_tag, size)	\
			
 
				-	FUT_DO_PROBE4(FUT_MPI_ISEND_COMPLETE_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
			
 
				-#define TRACE_MPI_ISEND_COMPLETE_END(dest, mpi_tag, size)	\
			
 
				-	FUT_DO_PROBE4(FUT_MPI_ISEND_COMPLETE_END, (dest), (mpi_tag), (size), _starpu_gettid());
			
 
				-#define TRACE_MPI_IRECV_COMPLETE_BEGIN(src, mpi_tag)	\
			
 
				-	FUT_DO_PROBE3(FUT_MPI_IRECV_COMPLETE_BEGIN, (src), (mpi_tag), _starpu_gettid());
			
 
				-#define TRACE_MPI_IRECV_COMPLETE_END(src, mpi_tag)	\
			
 
				-	FUT_DO_PROBE3(FUT_MPI_IRECV_COMPLETE_END, (src), (mpi_tag), _starpu_gettid());
			
 
				-#define TRACE_MPI_SLEEP_BEGIN()	\
			
 
				-	FUT_DO_PROBE1(FUT_MPI_SLEEP_BEGIN, _starpu_gettid());
			
 
				-#define TRACE_MPI_SLEEP_END()	\
			
 
				-	FUT_DO_PROBE1(FUT_MPI_SLEEP_END, _starpu_gettid());
			
 
				-#define TRACE_MPI_DTESTING_BEGIN()	\
			
 
				-	FUT_DO_PROBE1(FUT_MPI_DTESTING_BEGIN,  _starpu_gettid());
			
 
				-#define TRACE_MPI_DTESTING_END()	\
			
 
				-	FUT_DO_PROBE1(FUT_MPI_DTESTING_END, _starpu_gettid());
			
 
				-#define TRACE_MPI_UTESTING_BEGIN(src, mpi_tag)	\
			
 
				-	FUT_DO_PROBE3(FUT_MPI_UTESTING_BEGIN, (src), (mpi_tag),  _starpu_gettid());
			
 
				-#define TRACE_MPI_UTESTING_END(src, mpi_tag)	\
			
 
				-	FUT_DO_PROBE3(FUT_MPI_UTESTING_END, (src), (mpi_tag), _starpu_gettid());
			
 
				-#define TRACE_MPI_UWAIT_BEGIN(src, mpi_tag)	\
			
 
				-	FUT_DO_PROBE3(FUT_MPI_UWAIT_BEGIN, (src), (mpi_tag),  _starpu_gettid());
			
 
				-#define TRACE_MPI_UWAIT_END(src, mpi_tag)	\
			
 
				-	FUT_DO_PROBE3(FUT_MPI_UWAIT_END, (src), (mpi_tag), _starpu_gettid());
			
 
				+#define _STARPU_MPI_TRACE_START(rank, worldsize)	\
			
 
				+	FUT_DO_PROBE3(_STARPU_MPI_FUT_START, (rank), (worldsize), _starpu_gettid());
			
 
				+#define _STARPU_MPI_TRACE_STOP(rank, worldsize)	\
			
 
				+	FUT_DO_PROBE3(_STARPU_MPI_FUT_STOP, (rank), (worldsize), _starpu_gettid());
			
 
				+#define _STARPU_MPI_TRACE_BARRIER(rank, worldsize, key)	\
			
 
				+	FUT_DO_PROBE4(_STARPU_MPI_FUT_BARRIER, (rank), (worldsize), (key), _starpu_gettid());
			
 
				+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(dest, mpi_tag, size)	\
			
 
				+	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
			
 
				+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(dest, mpi_tag, size)	\
			
 
				+	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_SUBMIT_END, (dest), (mpi_tag), (size), _starpu_gettid());
			
 
				+#define _STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(src, mpi_tag)	\
			
 
				+	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN, (src), (mpi_tag), _starpu_gettid());
			
 
				+#define _STARPU_MPI_TRACE_IRECV_SUBMIT_END(src, mpi_tag)	\
			
 
				+	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_SUBMIT_END, (src), (mpi_tag), _starpu_gettid());
			
 
				+#define _STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(dest, mpi_tag, size)	\
			
 
				+	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_COMPLETE_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
			
 
				+#define _STARPU_MPI_TRACE_ISEND_COMPLETE_END(dest, mpi_tag, size)	\
			
 
				+	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_COMPLETE_END, (dest), (mpi_tag), (size), _starpu_gettid());
			
 
				+#define _STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(src, mpi_tag)	\
			
 
				+	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_COMPLETE_BEGIN, (src), (mpi_tag), _starpu_gettid());
			
 
				+#define _STARPU_MPI_TRACE_IRECV_COMPLETE_END(src, mpi_tag)	\
			
 
				+	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_COMPLETE_END, (src), (mpi_tag), _starpu_gettid());
			
 
				+#define _STARPU_MPI_TRACE_SLEEP_BEGIN()	\
			
 
				+	FUT_DO_PROBE1(_STARPU_MPI_FUT_SLEEP_BEGIN, _starpu_gettid());
			
 
				+#define _STARPU_MPI_TRACE_SLEEP_END()	\
			
 
				+	FUT_DO_PROBE1(_STARPU_MPI_FUT_SLEEP_END, _starpu_gettid());
			
 
				+#define _STARPU_MPI_TRACE_DTESTING_BEGIN()	\
			
 
				+	FUT_DO_PROBE1(_STARPU_MPI_FUT_DTESTING_BEGIN,  _starpu_gettid());
			
 
				+#define _STARPU_MPI_TRACE_DTESTING_END()	\
			
 
				+	FUT_DO_PROBE1(_STARPU_MPI_FUT_DTESTING_END, _starpu_gettid());
			
 
				+#define _STARPU_MPI_TRACE_UTESTING_BEGIN(src, mpi_tag)	\
			
 
				+	FUT_DO_PROBE3(_STARPU_MPI_FUT_UTESTING_BEGIN, (src), (mpi_tag),  _starpu_gettid());
			
 
				+#define _STARPU_MPI_TRACE_UTESTING_END(src, mpi_tag)	\
			
 
				+	FUT_DO_PROBE3(_STARPU_MPI_FUT_UTESTING_END, (src), (mpi_tag), _starpu_gettid());
			
 
				+#define _STARPU_MPI_TRACE_UWAIT_BEGIN(src, mpi_tag)	\
			
 
				+	FUT_DO_PROBE3(_STARPU_MPI_FUT_UWAIT_BEGIN, (src), (mpi_tag),  _starpu_gettid());
			
 
				+#define _STARPU_MPI_TRACE_UWAIT_END(src, mpi_tag)	\
			
 
				+	FUT_DO_PROBE3(_STARPU_MPI_FUT_UWAIT_END, (src), (mpi_tag), _starpu_gettid());
			
 
				 #define TRACE
			
 
				 #else
			
 
				-#define TRACE_MPI_START(a, b)				do {} while(0);
			
 
				-#define TRACE_MPI_STOP(a, b)				do {} while(0);
			
 
				-#define TRACE_MPI_BARRIER(a, b, c)			do {} while(0);
			
 
				-#define TRACE_MPI_ISEND_SUBMIT_BEGIN(a, b, c)		do {} while(0);
			
 
				-#define TRACE_MPI_ISEND_SUBMIT_END(a, b, c)		do {} while(0);
			
 
				-#define TRACE_MPI_IRECV_SUBMIT_BEGIN(a, b)		do {} while(0);
			
 
				-#define TRACE_MPI_IRECV_SUBMIT_END(a, b)		do {} while(0);
			
 
				-#define TRACE_MPI_ISEND_COMPLETE_BEGIN(a, b, c)		do {} while(0);
			
 
				-#define TRACE_MPI_ISEND_COMPLETE_END(a, b, c)		do {} while(0);
			
 
				-#define TRACE_MPI_IRECV_COMPLETE_BEGIN(a, b)		do {} while(0);
			
 
				-#define TRACE_MPI_IRECV_COMPLETE_END(a, b)		do {} while(0);
			
 
				-#define TRACE_MPI_SLEEP_BEGIN()				do {} while(0);
			
 
				-#define TRACE_MPI_SLEEP_END()				do {} while(0);
			
 
				-#define TRACE_MPI_DTESTING_BEGIN()			do {} while(0);
			
 
				-#define TRACE_MPI_DTESTING_END()			do {} while(0);
			
 
				-#define TRACE_MPI_UTESTING_BEGIN(a, b)			do {} while(0);
			
 
				-#define TRACE_MPI_UTESTING_END(a, b)			do {} while(0);
			
 
				-#define TRACE_MPI_UWAIT_BEGIN(a, b)			do {} while(0);
			
 
				-#define TRACE_MPI_UWAIT_END(a, b)			do {} while(0);
			
 
				+#define _STARPU_MPI_TRACE_START(a, b)				do {} while(0);
			
 
				+#define _STARPU_MPI_TRACE_STOP(a, b)				do {} while(0);
			
 
				+#define _STARPU_MPI_TRACE_BARRIER(a, b, c)			do {} while(0);
			
 
				+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(a, b, c)		do {} while(0);
			
 
				+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(a, b, c)		do {} while(0);
			
 
				+#define _STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(a, b)		do {} while(0);
			
 
				+#define _STARPU_MPI_TRACE_IRECV_SUBMIT_END(a, b)		do {} while(0);
			
 
				+#define _STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(a, b, c)		do {} while(0);
			
 
				+#define _STARPU_MPI_TRACE_ISEND_COMPLETE_END(a, b, c)		do {} while(0);
			
 
				+#define _STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(a, b)		do {} while(0);
			
 
				+#define _STARPU_MPI_TRACE_IRECV_COMPLETE_END(a, b)		do {} while(0);
			
 
				+#define _STARPU_MPI_TRACE_SLEEP_BEGIN()				do {} while(0);
			
 
				+#define _STARPU_MPI_TRACE_SLEEP_END()				do {} while(0);
			
 
				+#define _STARPU_MPI_TRACE_DTESTING_BEGIN()			do {} while(0);
			
 
				+#define _STARPU_MPI_TRACE_DTESTING_END()			do {} while(0);
			
 
				+#define _STARPU_MPI_TRACE_UTESTING_BEGIN(a, b)			do {} while(0);
			
 
				+#define _STARPU_MPI_TRACE_UTESTING_END(a, b)			do {} while(0);
			
 
				+#define _STARPU_MPI_TRACE_UWAIT_BEGIN(a, b)			do {} while(0);
			
 
				+#define _STARPU_MPI_TRACE_UWAIT_END(a, b)			do {} while(0);
			
 
				 #endif
			
 
				 
			
 
				 #ifdef __cplusplus
			
--- a/src/common/starpu_spinlock.c
+++ b/src/common/starpu_spinlock.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011, 2013, 2014  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -24,15 +24,17 @@
 
				 int _starpu_spin_init(struct _starpu_spinlock *lock)
			
 
				 {
			
 
				 #if defined(STARPU_SPINLOCK_CHECK)
			
 
				+	starpu_pthread_mutexattr_t errcheck_attr;
			
 
				 //	memcpy(&lock->errcheck_lock, PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP, sizeof(PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP));
			
 
				 	int ret;
			
 
				-	ret = starpu_pthread_mutexattr_init(&lock->errcheck_attr);
			
 
				+	ret = starpu_pthread_mutexattr_init(&errcheck_attr);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "pthread_mutexattr_init");
			
 
				 
			
 
				-	ret = starpu_pthread_mutexattr_settype(&lock->errcheck_attr, PTHREAD_MUTEX_ERRORCHECK);
			
 
				+	ret = starpu_pthread_mutexattr_settype(&errcheck_attr, PTHREAD_MUTEX_ERRORCHECK);
			
 
				 	STARPU_ASSERT(!ret);
			
 
				 
			
 
				-	ret = starpu_pthread_mutex_init(&lock->errcheck_lock, &lock->errcheck_attr);
			
 
				+	ret = starpu_pthread_mutex_init(&lock->errcheck_lock, &errcheck_attr);
			
 
				+	starpu_pthread_mutexattr_destroy(&errcheck_attr);
			
 
				 	return ret;
			
 
				 #else
			
 
				 	int ret = starpu_pthread_spin_init(&lock->lock, 0);
			
@@ -44,7 +46,6 @@ int _starpu_spin_init(struct _starpu_spinlock *lock)
 
				 int _starpu_spin_destroy(struct _starpu_spinlock *lock STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 #if defined(STARPU_SPINLOCK_CHECK)
			
 
				-	starpu_pthread_mutexattr_destroy(&lock->errcheck_attr);
			
 
				 	return starpu_pthread_mutex_destroy(&lock->errcheck_lock);
			
 
				 #else
			
 
				 	return starpu_pthread_spin_destroy(&lock->lock);
			
--- a/src/common/starpu_spinlock.h
+++ b/src/common/starpu_spinlock.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2013  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2014  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011, 2013, 2014  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -25,7 +25,6 @@
 
				 struct _starpu_spinlock
			
 
				 {
			
 
				 #if defined(STARPU_SPINLOCK_CHECK)
			
 
				-	starpu_pthread_mutexattr_t errcheck_attr;
			
 
				 	starpu_pthread_mutex_t errcheck_lock;
			
 
				 	const char *last_taker;
			
 
				 #else
			
--- a/src/common/utils.c
+++ b/src/common/utils.c
@@ -83,10 +83,24 @@ int _starpu_mkpath(const char *s, mode_t mode)
 
				 	if ((_starpu_mkpath(up, mode) == -1) && (errno != EEXIST))
			
 
				 		goto out;
			
 
				 
			
 
				-	if ((mkdir(path, mode) == -1) && (errno != EEXIST))
			
 
				-		rv = -1;
			
 
				-	else
			
 
				+	struct stat sb;
			
 
				+	if (stat(path, &sb) == 0)
			
 
				+	{
			
 
				+		if (!S_ISDIR(sb.st_mode))
			
 
				+		{
			
 
				+			fprintf(stderr,"Error: %s is not a directory:\n", path);
			
 
				+			STARPU_ABORT();
			
 
				+		}
			
 
				+		/* It already exists and is a directory.  */
			
 
				 		rv = 0;
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+		if ((mkdir(path, mode) == -1) && (errno != EEXIST))
			
 
				+			rv = -1;
			
 
				+		else
			
 
				+			rv = 0;
			
 
				+	}
			
 
				 
			
 
				 out:
			
 
				 	olderrno = errno;
			
@@ -105,23 +119,11 @@ void _starpu_mkpath_and_check(const char *path, mode_t mode)
 
				 
			
 
				 	ret = _starpu_mkpath(path, mode);
			
 
				 
			
 
				-	if (ret == -1)
			
 
				+	if (ret == -1 && errno != EEXIST)
			
 
				 	{
			
 
				-		if (errno != EEXIST)
			
 
				-		{
			
 
				-			fprintf(stderr,"Error making StarPU directory %s:\n", path);
			
 
				-			perror("mkdir");
			
 
				-			STARPU_ABORT();
			
 
				-		}
			
 
				-
			
 
				-		/* make sure that it is actually a directory */
			
 
				-		struct stat sb;
			
 
				-		stat(path, &sb);
			
 
				-		if (!S_ISDIR(sb.st_mode))
			
 
				-		{
			
 
				-			fprintf(stderr,"Error: %s is not a directory:\n", path);
			
 
				-			STARPU_ABORT();
			
 
				-		}
			
 
				+		fprintf(stderr,"Error making StarPU directory %s:\n", path);
			
 
				+		perror("mkdir");
			
 
				+		STARPU_ABORT();
			
 
				 	}
			
 
				 }
			
 
				 
			
--- a/src/core/perfmodel/perfmodel_history.c
+++ b/src/core/perfmodel/perfmodel_history.c
@@ -1230,7 +1230,7 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 
				 			char archname[32];
			
 
				 
			
 
				 			starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), nimpl);
			
 
				-			_STARPU_DISP("Warning: model %s is not calibrated enough for %s, forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname);
			
 
				+			_STARPU_DISP("Warning: model %s is not calibrated enough for %s (only %u measurements), forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname, entry && entry->history_entry ? entry->history_entry->nsample : 0);
			
 
				 			_starpu_set_calibrate_flag(1);
			
 
				 			model->benchmarking = 1;
			
 
				 		}
			
@@ -1272,7 +1272,7 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, s
 
				 		char archname[32];
			
 
				 
			
 
				 		starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), nimpl);
			
 
				-		_STARPU_DISP("Warning: model %s is not calibrated enough for %s, forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname);
			
 
				+		_STARPU_DISP("Warning: model %s is not calibrated enough for %s (only %u measurements), forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname, entry ? entry->nsample : 0);
			
 
				 		_starpu_set_calibrate_flag(1);
			
 
				 		model->benchmarking = 1;
			
 
				 	}
			
--- a/src/core/sched_policy.c
+++ b/src/core/sched_policy.c
@@ -841,6 +841,7 @@ pick:
 
				 	 * We do have a task that uses multiformat handles. Let's create the
			
 
				 	 * required conversion tasks.
			
 
				 	 */
			
 
				+	STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
			
 
				 	unsigned i;
			
 
				 	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
			
 
				 	for (i = 0; i < nbuffers; i++)
			
@@ -864,6 +865,7 @@ pick:
 
				 
			
 
				 	task->mf_skip = 1;
			
 
				 	starpu_task_list_push_back(&worker->local_tasks, task);
			
 
				+	STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
			
 
				 	goto pick;
			
 
				 
			
 
				 profiling:
			
--- a/src/core/simgrid.c
+++ b/src/core/simgrid.c
@@ -27,6 +27,7 @@
 
				 #ifdef STARPU_SIMGRID
			
 
				 #include <msg/msg.h>
			
 
				 #include <smpi/smpif.h>
			
 
				+#include <sys/resource.h>
			
 
				 
			
 
				 #define STARPU_MPI_AS_PREFIX "StarPU-MPI"
			
 
				 
			
@@ -160,7 +161,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 	if (!starpu_main && !(smpi_main && smpi_simulated_main_))
			
 
				 	{
			
 
				-		_STARPU_ERROR("The main file of this application needs to be compiled with starpu.h included, to properly define starpu_main\n");
			
 
				+		_STARPU_ERROR("In simgrid mode, the file containing the main() function of this application needs to be compiled with starpu.h included, to properly rename it into starpu_main\n");
			
 
				 		exit(EXIT_FAILURE);
			
 
				 	}
			
 
				 
			
@@ -178,7 +179,12 @@ int main(int argc, char **argv)
 
				 #endif
			
 
				 	/* Simgrid uses tiny stacks by default.  This comes unexpected to our users.  */
			
 
				 	extern xbt_cfg_t _sg_cfg_set;
			
 
				-	xbt_cfg_set_int(_sg_cfg_set, "contexts/stack_size", 8192);
			
 
				+	unsigned stack_size = 8192;
			
 
				+	struct rlimit rlim;
			
 
				+	if (getrlimit(RLIMIT_STACK, &rlim) == 0 && rlim.rlim_cur != 0 && rlim.rlim_cur != RLIM_INFINITY)
			
 
				+		stack_size = rlim.rlim_cur / 1024;
			
 
				+
			
 
				+	xbt_cfg_set_int(_sg_cfg_set, "contexts/stack_size", stack_size);
			
 
				 
			
 
				 	/* Load XML platform */
			
 
				 	_starpu_simgrid_get_platform_path(path, sizeof(path));
			
@@ -196,6 +202,12 @@ void _starpu_simgrid_init()
 
				 	xbt_dynar_t hosts;
			
 
				 	int i;
			
 
				 
			
 
				+	if (!starpu_main && !(smpi_main && smpi_simulated_main_))
			
 
				+	{
			
 
				+		_STARPU_ERROR("In simgrid mode, the file containing the main() function of this application needs to be compiled with starpu.h included, to properly rename it into starpu_main\n");
			
 
				+		exit(EXIT_FAILURE);
			
 
				+	}
			
 
				+
			
 
				 #ifdef HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT
			
 
				 	if (_starpu_simgrid_running_smpi())
			
 
				 	{
			
@@ -253,6 +265,10 @@ void _starpu_simgrid_init()
 
				 	xbt_dynar_free(&hosts);
			
 
				 }
			
 
				 
			
 
				+/*
			
 
				+ * Tasks
			
 
				+ */
			
 
				+
			
 
				 /* Task execution submitted by StarPU */
			
 
				 void _starpu_simgrid_execute_job(struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length)
			
 
				 {
			
@@ -276,8 +292,13 @@ void _starpu_simgrid_execute_job(struct _starpu_job *j, struct starpu_perfmodel_
 
				 			length/1000000.0*MSG_get_host_speed(MSG_host_self()),
			
 
				 			0, NULL);
			
 
				 	MSG_task_execute(simgrid_task);
			
 
				+	MSG_task_destroy(simgrid_task);
			
 
				 }
			
 
				 
			
 
				+/*
			
 
				+ * Transfers
			
 
				+ */
			
 
				+
			
 
				 /* Note: simgrid is not parallel, so there is no need to hold locks for management of transfers.  */
			
 
				 LIST_TYPE(transfer,
			
 
				 	msg_task_t task;
			
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -1218,16 +1218,17 @@ out:
 
				 			STARPU_ASSERT(worker->local_ordered_tasks[n] == NULL);
			
 
				 		_starpu_sched_ctx_list_delete(&worker->sched_ctx_list);
			
 
				 		_starpu_job_list_delete(worker->terminated_jobs);
			
 
				+		free(worker->local_ordered_tasks);
			
 
				 	}
			
 
				 }
			
 
				 
			
 
				 /* Condition variable and mutex used to pause/resume. */
			
 
				 static starpu_pthread_cond_t pause_cond = STARPU_PTHREAD_COND_INITIALIZER;
			
 
				 static starpu_pthread_mutex_t pause_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
			
 
				-unsigned _starpu_machine_is_running(void)
			
 
				+
			
 
				+void _starpu_may_pause(void)
			
 
				 {
			
 
				-	unsigned ret;
			
 
				-	/* running and pause_depth are just protected by a memory barrier */
			
 
				+	/* pause_depth is just protected by a memory barrier */
			
 
				 	STARPU_RMB();
			
 
				 
			
 
				 	if (STARPU_UNLIKELY(config.pause_depth > 0)) {
			
@@ -1237,6 +1238,13 @@ unsigned _starpu_machine_is_running(void)
 
				 		}
			
 
				 		STARPU_PTHREAD_MUTEX_UNLOCK(&pause_mutex);
			
 
				 	}
			
 
				+}
			
 
				+
			
 
				+unsigned _starpu_machine_is_running(void)
			
 
				+{
			
 
				+	unsigned ret;
			
 
				+	/* running is just protected by a memory barrier */
			
 
				+	STARPU_RMB();
			
 
				 
			
 
				 	ANNOTATE_HAPPENS_AFTER(&config.running);
			
 
				 	ret = config.running;
			
--- a/src/core/workers.h
+++ b/src/core/workers.h
@@ -354,6 +354,9 @@ char ***_starpu_get_argv();
 
				 /* Fill conf with environment variables */
			
 
				 void _starpu_conf_check_environment(struct starpu_conf *conf);
			
 
				 
			
 
				+/* Called by the driver when it is ready to pause  */
			
 
				+void _starpu_may_pause(void);
			
 
				+
			
 
				 /* Has starpu_shutdown already been called ? */
			
 
				 unsigned _starpu_machine_is_running(void);
			
 
				 
			
--- a/src/datawizard/coherency.c
+++ b/src/datawizard/coherency.c
@@ -41,8 +41,6 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 
				 	double cost = INFINITY;
			
 
				 	unsigned src_node_mask = 0;
			
 
				 
			
 
				-	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
			
 
				-
			
 
				 	for (node = 0; node < nnodes; node++)
			
 
				 	{
			
 
				 		if (handle->per_node[node].state != STARPU_INVALID)
			
@@ -74,15 +72,6 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 
				 				double time = starpu_transfer_predict(i, destination, size);
			
 
				 				unsigned handling_node;
			
 
				 
			
 
				-				/* Avoid transfers which the interface does not want */
			
 
				-				if (copy_methods->can_copy)
			
 
				-				{
			
 
				-					void *src_interface = handle->per_node[i].data_interface;
			
 
				-					void *dst_interface = handle->per_node[destination].data_interface;
			
 
				-					if (!copy_methods->can_copy(src_interface, i, dst_interface, destination))
			
 
				-						continue;
			
 
				-				}
			
 
				-
			
 
				 				/* Avoid indirect transfers */
			
 
				 				if (!link_supports_direct_transfers(handle, i, destination, &handling_node))
			
 
				 					continue;
			
@@ -115,22 +104,22 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 
				 		
			
 
				 		if (src_node_mask & (1<<i))
			
 
				 		{
			
 
				+			int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node) = handle->ops->copy_methods->can_copy;
			
 
				 			/* Avoid transfers which the interface does not want */
			
 
				-			if (copy_methods->can_copy)
			
 
				+			if (can_copy)
			
 
				 			{
			
 
				 				void *src_interface = handle->per_node[i].data_interface;
			
 
				 				void *dst_interface = handle->per_node[destination].data_interface;
			
 
				 				unsigned handling_node;
			
 
				 
			
 
				-				if (!copy_methods->can_copy(src_interface, i, dst_interface, destination))
			
 
				-					continue;
			
 
				-
			
 
				 				if (!link_supports_direct_transfers(handle, i, destination, &handling_node))
			
 
				 				{
			
 
				 					/* Avoid through RAM if the interface does not want it */
			
 
				 					void *ram_interface = handle->per_node[STARPU_MAIN_RAM].data_interface;
			
 
				-					if (!copy_methods->can_copy(src_interface, i, ram_interface, STARPU_MAIN_RAM)
			
 
				-					 || !copy_methods->can_copy(ram_interface, STARPU_MAIN_RAM, dst_interface, destination))
			
 
				+					if ((!can_copy(src_interface, i, ram_interface, STARPU_MAIN_RAM, i)
			
 
				+					  && !can_copy(src_interface, i, ram_interface, STARPU_MAIN_RAM, STARPU_MAIN_RAM))
			
 
				+					 || (!can_copy(ram_interface, STARPU_MAIN_RAM, dst_interface, destination, STARPU_MAIN_RAM)
			
 
				+					  && !can_copy(ram_interface, STARPU_MAIN_RAM, dst_interface, destination, destination)))
			
 
				 						continue;
			
 
				 				}
			
 
				 			}
			
@@ -251,7 +240,9 @@ static int worker_supports_direct_access(unsigned node, unsigned handling_node)
 
				 
			
 
				 static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned src_node, unsigned dst_node, unsigned *handling_node)
			
 
				 {
			
 
				-	(void) handle; // unused
			
 
				+	int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node) = handle->ops->copy_methods->can_copy;
			
 
				+	void *src_interface = handle->per_node[src_node].data_interface;
			
 
				+	void *dst_interface = handle->per_node[dst_node].data_interface;
			
 
				 
			
 
				 	/* XXX That's a hack until we fix cudaMemcpy3DPeerAsync in the block interface
			
 
				 	 * Perhaps not all data interface provide a direct GPU-GPU transfer
			
@@ -260,19 +251,19 @@ static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned
 
				 	if (src_node != dst_node && starpu_node_get_kind(src_node) == STARPU_CUDA_RAM && starpu_node_get_kind(dst_node) == STARPU_CUDA_RAM)
			
 
				 	{
			
 
				 		const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
			
 
				-		if (!copy_methods->cuda_to_cuda_async)
			
 
				+		if (!copy_methods->cuda_to_cuda_async && !copy_methods->any_to_any)
			
 
				 			return 0;
			
 
				 	}
			
 
				 #endif
			
 
				 
			
 
				 	/* Note: with CUDA, performance seems a bit better when issuing the transfer from the destination (tested without GPUDirect, but GPUDirect probably behave the same) */
			
 
				-	if (worker_supports_direct_access(src_node, dst_node))
			
 
				+	if (worker_supports_direct_access(src_node, dst_node) && (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, dst_node)))
			
 
				 	{
			
 
				 		*handling_node = dst_node;
			
 
				 		return 1;
			
 
				 	}
			
 
				 
			
 
				-	if (worker_supports_direct_access(dst_node, src_node))
			
 
				+	if (worker_supports_direct_access(dst_node, src_node) && (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, src_node)))
			
 
				 	{
			
 
				 		*handling_node = src_node;
			
 
				 		return 1;
			
@@ -319,6 +310,10 @@ static int determine_request_path(starpu_data_handle_t handle,
 
				 
			
 
				 	if (!link_is_valid)
			
 
				 	{
			
 
				+		int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node) = handle->ops->copy_methods->can_copy;
			
 
				+		void *src_interface = handle->per_node[src_node].data_interface;
			
 
				+		void *dst_interface = handle->per_node[dst_node].data_interface;
			
 
				+
			
 
				 		/* We need an intermediate hop to implement data staging
			
 
				 		 * through main memory. */
			
 
				 		STARPU_ASSERT(max_len >= 2);
			
@@ -326,12 +321,36 @@ static int determine_request_path(starpu_data_handle_t handle,
 
				 		/* GPU -> RAM */
			
 
				 		src_nodes[0] = src_node;
			
 
				 		dst_nodes[0] = STARPU_MAIN_RAM;
			
 
				-		handling_nodes[0] = starpu_node_get_kind(src_node) == STARPU_DISK_RAM ? dst_node : src_node;
			
 
				+
			
 
				+		if (starpu_node_get_kind(src_node) == STARPU_DISK_RAM)
			
 
				+			/* Disks don't have their own driver thread */
			
 
				+			handling_nodes[0] = dst_node;
			
 
				+		else if (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, src_node))
			
 
				+		{
			
 
				+			handling_nodes[0] = src_node;
			
 
				+		}
			
 
				+		else
			
 
				+		{
			
 
				+			STARPU_ASSERT_MSG(can_copy(src_interface, src_node, dst_interface, dst_node, dst_node), "interface %d refuses all kinds of transfers from node %u to node %u\n", handle->ops->interfaceid, src_node, dst_node);
			
 
				+			handling_nodes[0] = dst_node;
			
 
				+		}
			
 
				 
			
 
				 		/* RAM -> GPU */
			
 
				 		src_nodes[1] = STARPU_MAIN_RAM;
			
 
				 		dst_nodes[1] = dst_node;
			
 
				-		handling_nodes[1] = starpu_node_get_kind(dst_node) == STARPU_DISK_RAM ? src_node : dst_node;
			
 
				+
			
 
				+		if (starpu_node_get_kind(dst_node) == STARPU_DISK_RAM)
			
 
				+			/* Disks don't have their own driver thread */
			
 
				+			handling_nodes[1] = src_node;
			
 
				+		else if (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, dst_node))
			
 
				+		{
			
 
				+			handling_nodes[1] = dst_node;
			
 
				+		}
			
 
				+		else
			
 
				+		{
			
 
				+			STARPU_ASSERT_MSG(can_copy(src_interface, src_node, dst_interface, dst_node, src_node), "interface %d refuses all kinds of transfers from node %u to node %u\n", handle->ops->interfaceid, src_node, dst_node);
			
 
				+			handling_nodes[1] = src_node;
			
 
				+		}
			
 
				 
			
 
				 		return 2;
			
 
				 	}
			
--- a/src/datawizard/data_request.c
+++ b/src/datawizard/data_request.c
@@ -155,17 +155,22 @@ int _starpu_wait_data_request_completion(struct _starpu_data_request *r, unsigne
 
				 {
			
 
				 	int retval;
			
 
				 	int do_delete = 0;
			
 
				+	int completed;
			
 
				 
			
 
				 	unsigned local_node = _starpu_memory_node_get_local_key();
			
 
				 
			
 
				 	do
			
 
				 	{
			
 
				-		_starpu_spin_lock(&r->lock);
			
 
				-
			
 
				-		if (r->completed)
			
 
				-			break;
			
 
				-
			
 
				-		_starpu_spin_unlock(&r->lock);
			
 
				+		STARPU_HG_DISABLE_CHECKING(r->completed);
			
 
				+		completed = r->completed;
			
 
				+		STARPU_HG_ENABLE_CHECKING(r->completed);
			
 
				+		if (completed)
			
 
				+		{
			
 
				+			_starpu_spin_lock(&r->lock);
			
 
				+			if (r->completed)
			
 
				+				break;
			
 
				+			_starpu_spin_unlock(&r->lock);
			
 
				+		}
			
 
				 
			
 
				 #ifndef STARPU_NON_BLOCKING_DRIVERS
			
 
				 		_starpu_wake_all_blocked_workers_on_node(r->handling_node);
			
--- a/src/datawizard/malloc.c
+++ b/src/datawizard/malloc.c
@@ -50,7 +50,7 @@ struct malloc_pinned_codelet_struct
 
				 //{
			
 
				 //	struct malloc_pinned_codelet_struct *s = arg;
			
 
				 //        //        *(s->ptr) = malloc(s->dim);
			
 
				-//        starpu_opencl_allocate_memory((void **)(s->ptr), s->dim, CL_MEM_READ_WRITE|CL_MEM_ALLOC_HOST_PTR);
			
 
				+//        starpu_opencl_allocate_memory(devid, (void **)(s->ptr), s->dim, CL_MEM_READ_WRITE|CL_MEM_ALLOC_HOST_PTR);
			
 
				 //}
			
 
				 //#endif
			
 
				 
			
@@ -404,6 +404,14 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size)
 
				 			STARPU_ASSERT(last[dst_node] >= addr);
			
 
				 			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
			
 
				 #else
			
 
				+			struct _starpu_worker *worker = _starpu_get_local_worker_key();
			
 
				+			unsigned devid = _starpu_memory_node_get_devid(dst_node);
			
 
				+			if (!worker || worker->arch != STARPU_CUDA_WORKER || worker->devid != devid)
			
 
				+#if defined(HAVE_CUDA_MEMCPY_PEER)
			
 
				+				starpu_cuda_set_device(devid);
			
 
				+#else
			
 
				+				STARPU_ASSERT_MSG(0, "CUDA peer access is not available with this version of CUDA");
			
 
				+#endif
			
 
				 			status = cudaMalloc((void **)&addr, size);
			
 
				 			if (!addr || (status != cudaSuccess))
			
 
				 			{
			
@@ -433,7 +441,7 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size)
 
				                                 int ret;
			
 
				 				cl_mem ptr;
			
 
				 
			
 
				-				ret = starpu_opencl_allocate_memory(&ptr, size, CL_MEM_READ_WRITE);
			
 
				+				ret = starpu_opencl_allocate_memory(_starpu_memory_node_get_devid(dst_node), &ptr, size, CL_MEM_READ_WRITE);
			
 
				 				if (ret)
			
 
				 				{
			
 
				 					addr = 0;
			
@@ -505,6 +513,14 @@ _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 
				 			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
			
 
				 #else
			
 
				 			cudaError_t err;
			
 
				+			struct _starpu_worker *worker = _starpu_get_local_worker_key();
			
 
				+			unsigned devid = _starpu_memory_node_get_devid(dst_node);
			
 
				+			if (!worker || worker->arch != STARPU_CUDA_WORKER || worker->devid != devid)
			
 
				+#if defined(HAVE_CUDA_MEMCPY_PEER)
			
 
				+				starpu_cuda_set_device(devid);
			
 
				+#else
			
 
				+				STARPU_ASSERT_MSG(0, "CUDA peer access is not available with this version of CUDA");
			
 
				+#endif
			
 
				 			err = cudaFree((void*)addr);
			
 
				 			if (STARPU_UNLIKELY(err != cudaSuccess))
			
 
				 				STARPU_CUDA_REPORT_ERROR(err);
			
--- a/src/datawizard/memalloc.c
+++ b/src/datawizard/memalloc.c
@@ -440,19 +440,24 @@ static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_re
 
				 
			
 
				 	if (!old_replicate)
			
 
				 	{
			
 
				+		/* Free the copy that we made */
			
 
				 		free(mc->chunk_interface);
			
 
				 		mc->chunk_interface = NULL;
			
 
				 	}
			
 
				 
			
 
				-	mc->data = new_replicate->handle;
			
 
				-	/* mc->ops, mc->footprint and mc->interface should be
			
 
				+	/* XXX: We do not actually reuse the mc at the moment, only the interface */
			
 
				+
			
 
				+	/* mc->data = new_replicate->handle; */
			
 
				+	/* mc->footprint, mc->ops, mc->size_interface, mc->automatically_allocated should be
			
 
				  	 * unchanged ! */
			
 
				 
			
 
				-	/* reinsert the mem chunk in the list of active memory chunks */
			
 
				-	if (!is_already_in_mc_list)
			
 
				+	/* remove the mem chunk from the list of active memory chunks, register_mem_chunk will put it back later */
			
 
				+	if (is_already_in_mc_list)
			
 
				 	{
			
 
				-		_starpu_mem_chunk_list_push_back(mc_list[node], mc);
			
 
				+		_starpu_mem_chunk_list_erase(mc_list[node], mc);
			
 
				 	}
			
 
				+
			
 
				+	free(mc);
			
 
				 }
			
 
				 
			
 
				 static unsigned try_to_reuse_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node, struct _starpu_data_replicate *replicate, unsigned is_already_in_mc_list)
			
--- a/src/debug/traces/starpu_fxt.c
+++ b/src/debug/traces/starpu_fxt.c
@@ -1893,79 +1893,79 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 
				 				handle_user_event(&ev, options);
			
 
				 				break;
			
 
				 
			
 
				-			case FUT_MPI_START:
			
 
				+			case _STARPU_MPI_FUT_START:
			
 
				 				handle_mpi_start(&ev, options);
			
 
				 				break;
			
 
				 
			
 
				-			case FUT_MPI_STOP:
			
 
				+			case _STARPU_MPI_FUT_STOP:
			
 
				 				handle_mpi_stop(&ev, options);
			
 
				 				break;
			
 
				 
			
 
				-			case FUT_MPI_BARRIER:
			
 
				+			case _STARPU_MPI_FUT_BARRIER:
			
 
				 				handle_mpi_barrier(&ev, options);
			
 
				 				break;
			
 
				 
			
 
				-			case FUT_MPI_ISEND_SUBMIT_BEGIN:
			
 
				+			case _STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN:
			
 
				 				handle_mpi_isend_submit_begin(&ev, options);
			
 
				 				break;
			
 
				 
			
 
				-			case FUT_MPI_ISEND_SUBMIT_END:
			
 
				+			case _STARPU_MPI_FUT_ISEND_SUBMIT_END:
			
 
				 				handle_mpi_isend_submit_end(&ev, options);
			
 
				 				break;
			
 
				 
			
 
				-			case FUT_MPI_IRECV_SUBMIT_BEGIN:
			
 
				+			case _STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN:
			
 
				 				handle_mpi_irecv_submit_begin(&ev, options);
			
 
				 				break;
			
 
				 
			
 
				-			case FUT_MPI_IRECV_SUBMIT_END:
			
 
				+			case _STARPU_MPI_FUT_IRECV_SUBMIT_END:
			
 
				 				handle_mpi_irecv_submit_end(&ev, options);
			
 
				 				break;
			
 
				 
			
 
				-			case FUT_MPI_ISEND_COMPLETE_BEGIN:
			
 
				+			case _STARPU_MPI_FUT_ISEND_COMPLETE_BEGIN:
			
 
				 				handle_mpi_isend_complete_begin(&ev, options);
			
 
				 				break;
			
 
				 
			
 
				-			case FUT_MPI_ISEND_COMPLETE_END:
			
 
				+			case _STARPU_MPI_FUT_ISEND_COMPLETE_END:
			
 
				 				handle_mpi_isend_complete_end(&ev, options);
			
 
				 				break;
			
 
				 
			
 
				-			case FUT_MPI_IRECV_COMPLETE_BEGIN:
			
 
				+			case _STARPU_MPI_FUT_IRECV_COMPLETE_BEGIN:
			
 
				 				handle_mpi_irecv_complete_begin(&ev, options);
			
 
				 				break;
			
 
				 
			
 
				-			case FUT_MPI_IRECV_COMPLETE_END:
			
 
				+			case _STARPU_MPI_FUT_IRECV_COMPLETE_END:
			
 
				 				handle_mpi_irecv_complete_end(&ev, options);
			
 
				 				break;
			
 
				 
			
 
				-			case FUT_MPI_SLEEP_BEGIN:
			
 
				+			case _STARPU_MPI_FUT_SLEEP_BEGIN:
			
 
				 				handle_mpi_sleep_begin(&ev, options);
			
 
				 				break;
			
 
				 
			
 
				-			case FUT_MPI_SLEEP_END:
			
 
				+			case _STARPU_MPI_FUT_SLEEP_END:
			
 
				 				handle_mpi_sleep_end(&ev, options);
			
 
				 				break;
			
 
				 
			
 
				-			case FUT_MPI_DTESTING_BEGIN:
			
 
				+			case _STARPU_MPI_FUT_DTESTING_BEGIN:
			
 
				 				handle_mpi_dtesting_begin(&ev, options);
			
 
				 				break;
			
 
				 
			
 
				-			case FUT_MPI_DTESTING_END:
			
 
				+			case _STARPU_MPI_FUT_DTESTING_END:
			
 
				 				handle_mpi_dtesting_end(&ev, options);
			
 
				 				break;
			
 
				 
			
 
				-			case FUT_MPI_UTESTING_BEGIN:
			
 
				+			case _STARPU_MPI_FUT_UTESTING_BEGIN:
			
 
				 				handle_mpi_utesting_begin(&ev, options);
			
 
				 				break;
			
 
				 
			
 
				-			case FUT_MPI_UTESTING_END:
			
 
				+			case _STARPU_MPI_FUT_UTESTING_END:
			
 
				 				handle_mpi_utesting_end(&ev, options);
			
 
				 				break;
			
 
				 
			
 
				-			case FUT_MPI_UWAIT_BEGIN:
			
 
				+			case _STARPU_MPI_FUT_UWAIT_BEGIN:
			
 
				 				handle_mpi_uwait_begin(&ev, options);
			
 
				 				break;
			
 
				 
			
 
				-			case FUT_MPI_UWAIT_END:
			
 
				+			case _STARPU_MPI_FUT_UWAIT_END:
			
 
				 				handle_mpi_uwait_end(&ev, options);
			
 
				 				break;
			
 
				 
			
--- a/src/debug/traces/starpu_fxt_mpi.c
+++ b/src/debug/traces/starpu_fxt_mpi.c
@@ -74,7 +74,7 @@ int _starpu_fxt_mpi_find_sync_point(char *filename_in, uint64_t *offset, int *ke
 
				 			break;
			
 
				 		}
			
 
				 
			
 
				-		if (ev.code == FUT_MPI_BARRIER)
			
 
				+		if (ev.code == _STARPU_MPI_FUT_BARRIER)
			
 
				 		{
			
 
				 			/* We found the sync point */
			
 
				 			*offset = ev.time;
			
--- a/src/drivers/cpu/driver_cpu.c
+++ b/src/drivers/cpu/driver_cpu.c
@@ -301,7 +301,10 @@ _starpu_cpu_worker(void *arg)
 
				 
			
 
				 	_starpu_cpu_driver_init(args);
			
 
				 	while (_starpu_machine_is_running())
			
 
				+	{
			
 
				+		_starpu_may_pause();
			
 
				 		_starpu_cpu_driver_run_once(args);
			
 
				+	}
			
 
				 	_starpu_cpu_driver_deinit(args);
			
 
				 
			
 
				 	return NULL;
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -567,6 +567,11 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 
				 		_STARPU_DEBUG("cuda (%s) dev id %u worker %u thread is ready to run on CPU %d !\n", devname, devid, i, worker->bindid);
			
 
				 
			
 
				 		worker->pipeline_length = starpu_get_env_number_default("STARPU_CUDA_PIPELINE", 2);
			
 
				+		if (worker->pipeline_length > STARPU_MAX_PIPELINE)
			
 
				+		{
			
 
				+			_STARPU_DISP("Warning: STARPU_CUDA_PIPELINE is %u, but STARPU_MAX_PIPELINE is only %u", worker->pipeline_length, STARPU_MAX_PIPELINE);
			
 
				+			worker->pipeline_length = STARPU_MAX_PIPELINE;
			
 
				+		}
			
 
				 		_STARPU_TRACE_WORKER_INIT_END(worker_set->workers[i].workerid);
			
 
				 	}
			
 
				 
			
@@ -752,7 +757,10 @@ void *_starpu_cuda_worker(void *_arg)
 
				 	_starpu_cuda_driver_init(worker);
			
 
				 	_STARPU_TRACE_START_PROGRESS(memnode);
			
 
				 	while (_starpu_machine_is_running())
			
 
				+	{
			
 
				+		_starpu_may_pause();
			
 
				 		_starpu_cuda_driver_run_once(worker);
			
 
				+	}
			
 
				 	_STARPU_TRACE_END_PROGRESS(memnode);
			
 
				 	_starpu_cuda_driver_deinit(worker);
			
 
				 
			
--- a/src/drivers/gordon/driver_gordon.c
+++ b/src/drivers/gordon/driver_gordon.c
@@ -343,6 +343,7 @@ void *gordon_worker_inject(struct _starpu_worker_set *arg)
 
				 
			
 
				 	while(_starpu_machine_is_running())
			
 
				 	{
			
 
				+		_starpu_may_pause();
			
 
				 		if (gordon_busy_enough())
			
 
				 		{
			
 
				 			/* gordon already has enough work, wait a little TODO */
			
--- a/src/drivers/mp_common/source_common.c
+++ b/src/drivers/mp_common/source_common.c
@@ -683,6 +683,8 @@ void _starpu_src_common_worker(struct _starpu_worker_set * worker_set,
 
				 		int res;
			
 
				 		struct _starpu_job * j;
			
 
				 
			
 
				+		_starpu_may_pause();
			
 
				+
			
 
				 		_STARPU_TRACE_START_PROGRESS(memnode);
			
 
				 		_starpu_datawizard_progress(memnode, 1);
			
 
				 		_STARPU_TRACE_END_PROGRESS(memnode);
			
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -214,16 +214,15 @@ cl_int _starpu_opencl_deinit_context(int devid)
 
				 }
			
 
				 #endif
			
 
				 
			
 
				-cl_int starpu_opencl_allocate_memory(cl_mem *mem STARPU_ATTRIBUTE_UNUSED, size_t size STARPU_ATTRIBUTE_UNUSED, cl_mem_flags flags STARPU_ATTRIBUTE_UNUSED)
			
 
				+cl_int starpu_opencl_allocate_memory(int devid, cl_mem *mem STARPU_ATTRIBUTE_UNUSED, size_t size STARPU_ATTRIBUTE_UNUSED, cl_mem_flags flags STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 #ifdef STARPU_SIMGRID
			
 
				 	STARPU_ABORT();
			
 
				 #else
			
 
				 	cl_int err;
			
 
				         cl_mem memory;
			
 
				-        struct _starpu_worker *worker = _starpu_get_local_worker_key();
			
 
				 
			
 
				-	memory = clCreateBuffer(contexts[worker->devid], flags, size, NULL, &err);
			
 
				+	memory = clCreateBuffer(contexts[devid], flags, size, NULL, &err);
			
 
				 	if (err == CL_OUT_OF_HOST_MEMORY) return err;
			
 
				         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
@@ -234,7 +233,7 @@ cl_int starpu_opencl_allocate_memory(cl_mem *mem STARPU_ATTRIBUTE_UNUSED, size_t
 
				 	 */
			
 
				 	char dummy = 0;
			
 
				 	cl_event ev;
			
 
				-	err = clEnqueueWriteBuffer(alloc_queues[worker->devid], memory, CL_TRUE,
			
 
				+	err = clEnqueueWriteBuffer(alloc_queues[devid], memory, CL_TRUE,
			
 
				 				   0, sizeof(dummy), &dummy,
			
 
				 				   0, NULL, &ev);
			
 
				 	if (err == CL_MEM_OBJECT_ALLOCATION_FAILURE)
			
@@ -597,6 +596,11 @@ int _starpu_opencl_driver_init(struct _starpu_worker *worker)
 
				 	snprintf(worker->short_name, sizeof(worker->short_name), "OpenCL %u", devid);
			
 
				 
			
 
				 	worker->pipeline_length = starpu_get_env_number_default("STARPU_OPENCL_PIPELINE", 2);
			
 
				+	if (worker->pipeline_length > STARPU_MAX_PIPELINE)
			
 
				+	{
			
 
				+		_STARPU_DISP("Warning: STARPU_OPENCL_PIPELINE is %u, but STARPU_MAX_PIPELINE is only %u", worker->pipeline_length, STARPU_MAX_PIPELINE);
			
 
				+		worker->pipeline_length = STARPU_MAX_PIPELINE;
			
 
				+	}
			
 
				 
			
 
				 	_STARPU_DEBUG("OpenCL (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, worker->bindid);
			
 
				 
			
@@ -741,7 +745,10 @@ void *_starpu_opencl_worker(void *_arg)
 
				 	_starpu_opencl_driver_init(worker);
			
 
				 	_STARPU_TRACE_START_PROGRESS(memnode);
			
 
				 	while (_starpu_machine_is_running())
			
 
				+	{
			
 
				+		_starpu_may_pause();
			
 
				 		_starpu_opencl_driver_run_once(worker);
			
 
				+	}
			
 
				 	_starpu_opencl_driver_deinit(worker);
			
 
				 	_STARPU_TRACE_END_PROGRESS(memnode);
			
 
				 
			
--- a/src/sched_policies/eager_central_policy.c
+++ b/src/sched_policies/eager_central_policy.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2013  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2014  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2011  INRIA
			
 
				  *
			
@@ -91,6 +91,9 @@ static int push_task_eager_policy(struct starpu_task *task)
 
				 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
			
 
				 	
			
 
				 	struct starpu_sched_ctx_iterator it;
			
 
				+#ifndef STARPU_NON_BLOCKING_DRIVERS
			
 
				+	char dowake[STARPU_NMAXWORKERS] = { 0 };
			
 
				+#endif
			
 
				 	if(workers->init_iterator)
			
 
				 		workers->init_iterator(workers, &it);
			
 
				 	
			
@@ -112,20 +115,35 @@ static int push_task_eager_policy(struct starpu_task *task)
 
				 #ifdef STARPU_NON_BLOCKING_DRIVERS
			
 
				 				starpu_bitmap_unset(data->waiters, worker);
			
 
				 				/* We really woke at least somebody, no need to wake somebody else */
			
 
				-				goto out;
			
 
				+				break;
			
 
				 #else
			
 
				-				starpu_pthread_mutex_t *sched_mutex;
			
 
				-				starpu_pthread_cond_t *sched_cond;
			
 
				-				starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
			
 
				-
			
 
				-				if (starpu_wakeup_worker(worker, sched_cond, sched_mutex))
			
 
				-				    goto out; // wake up a single worker
			
 
				+				dowake[worker] = 1;
			
 
				 #endif
			
 
				 			}
			
 
				 	}
			
 
				-out:
			
 
				+	/* Let the task free */
			
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
			
 
				 
			
 
				+#ifndef STARPU_NON_BLOCKING_DRIVERS
			
 
				+	/* Now that we have a list of potential workers, try to wake one */
			
 
				+	if(workers->init_iterator)
			
 
				+		workers->init_iterator(workers, &it);
			
 
				+	
			
 
				+	while(workers->has_next(workers, &it))
			
 
				+	{
			
 
				+		worker = workers->get_next(workers, &it);
			
 
				+		if (dowake[worker])
			
 
				+		{
			
 
				+			starpu_pthread_mutex_t *sched_mutex;
			
 
				+			starpu_pthread_cond_t *sched_cond;
			
 
				+			starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
			
 
				+
			
 
				+			if (starpu_wakeup_worker(worker, sched_cond, sched_mutex))
			
 
				+				break; // wake up a single worker
			
 
				+		}
			
 
				+	}
			
 
				+#endif
			
 
				+
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
@@ -154,9 +172,11 @@ static struct starpu_task *pop_task_eager_policy(unsigned sched_ctx_id)
 
				 	if (_starpu_fifo_empty(data->fifo))
			
 
				 		return NULL;
			
 
				 
			
 
				+#ifdef STARPU_NON_BLOCKING_DRIVERS
			
 
				 	if (starpu_bitmap_get(data->waiters, workerid))
			
 
				 		/* Nobody woke us, avoid bothering the mutex */
			
 
				 		return NULL;
			
 
				+#endif
			
 
				 
			
 
				 	STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
			
 
				 
			
--- a/src/sched_policies/eager_central_priority_policy.c
+++ b/src/sched_policies/eager_central_priority_policy.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2013  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2014  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2011  INRIA
			
 
				  *
			
@@ -139,6 +139,9 @@ static int _starpu_priority_push_task(struct starpu_task *task)
 
				 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
			
 
				 	
			
 
				 	struct starpu_sched_ctx_iterator it;
			
 
				+#ifndef STARPU_NON_BLOCKING_DRIVERS
			
 
				+	char dowake[STARPU_NMAXWORKERS] = { 0 };
			
 
				+#endif
			
 
				 	if(workers->init_iterator)
			
 
				 		workers->init_iterator(workers, &it);
			
 
				 	
			
@@ -160,20 +163,35 @@ static int _starpu_priority_push_task(struct starpu_task *task)
 
				 #ifdef STARPU_NON_BLOCKING_DRIVERS
			
 
				 				starpu_bitmap_unset(data->waiters, worker);
			
 
				 				/* We really woke at least somebody, no need to wake somebody else */
			
 
				-				goto out;
			
 
				+				break;
			
 
				 #else
			
 
				-				starpu_pthread_mutex_t *sched_mutex;
			
 
				-				starpu_pthread_cond_t *sched_cond;
			
 
				-				starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
			
 
				-
			
 
				-				if (starpu_wakeup_worker(worker, sched_cond, sched_mutex))
			
 
				-				    goto out; // wake up a single worker
			
 
				+				dowake[worker] = 1;
			
 
				 #endif
			
 
				 			}
			
 
				 	}
			
 
				-out:
			
 
				+	/* Let the task free */
			
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
			
 
				 
			
 
				+#ifndef STARPU_NON_BLOCKING_DRIVERS
			
 
				+	/* Now that we have a list of potential workers, try to wake one */
			
 
				+	if(workers->init_iterator)
			
 
				+		workers->init_iterator(workers, &it);
			
 
				+	
			
 
				+	while(workers->has_next(workers, &it))
			
 
				+	{
			
 
				+		worker = workers->get_next(workers, &it);
			
 
				+		if (dowake[worker])
			
 
				+		{
			
 
				+			starpu_pthread_mutex_t *sched_mutex;
			
 
				+			starpu_pthread_cond_t *sched_cond;
			
 
				+			starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
			
 
				+
			
 
				+			if (starpu_wakeup_worker(worker, sched_cond, sched_mutex))
			
 
				+				break; // wake up a single worker
			
 
				+		}
			
 
				+	}
			
 
				+#endif
			
 
				+
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
@@ -194,9 +212,11 @@ static struct starpu_task *_starpu_priority_pop_task(unsigned sched_ctx_id)
 
				 	if (taskq->total_ntasks == 0)
			
 
				 		return NULL;
			
 
				 
			
 
				+#ifdef STARPU_NON_BLOCKING_DRIVERS
			
 
				 	if (starpu_bitmap_get(data->waiters, workerid))
			
 
				 		/* Nobody woke us, avoid bothering the mutex */
			
 
				 		return NULL;
			
 
				+#endif
			
 
				 
			
 
				 	/* release this mutex before trying to wake up other workers */
			
 
				 	starpu_pthread_mutex_t *curr_sched_mutex;
			
--- a/src/sched_policies/locality_work_stealing_policy.c
+++ b/src/sched_policies/locality_work_stealing_policy.c
@@ -224,13 +224,12 @@ static int lws_push_task(struct starpu_task *task)
 
				 #ifndef STARPU_NON_BLOCKING_DRIVERS
			
 
				 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
			
 
				 	struct starpu_sched_ctx_iterator it;
			
 
				+	unsigned worker;
			
 
				 	if(workers->init_iterator)
			
 
				 		workers->init_iterator(workers, &it);
			
 
				 	while(workers->has_next(workers, &it))
			
 
				 	{
			
 
				 		worker = workers->get_next(workers, &it);
			
 
				-		starpu_pthread_mutex_t *sched_mutex;
			
 
				-		starpu_pthread_cond_t *sched_cond;
			
 
				 		starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
			
 
				 		STARPU_PTHREAD_COND_SIGNAL(sched_cond);
			
 
				 	}
			
@@ -368,6 +367,6 @@ struct starpu_sched_policy _starpu_sched_lws_policy =
 
				 	.pre_exec_hook = NULL,
			
 
				 	.post_exec_hook = NULL,
			
 
				 	.pop_every_task = NULL,
			
 
				-	.policy_name = "nws",
			
 
				-	.policy_description = "new work stealing"
			
 
				+	.policy_name = "lws",
			
 
				+	.policy_description = "locality work stealing"
			
 
				 };
			
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -151,6 +151,7 @@ noinst_PROGRAMS =				\
 
				 	datawizard/acquire_cb_insert		\
			
 
				 	datawizard/acquire_release		\
			
 
				 	datawizard/acquire_release2		\
			
 
				+	datawizard/cache			\
			
 
				 	datawizard/commute			\
			
 
				 	datawizard/copy				\
			
 
				 	datawizard/data_implicit_deps		\
			
--- a/tests/datawizard/cache.c
+++ b/tests/datawizard/cache.c
@@ -0,0 +1,100 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010, 2014  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include "../helper.h"
			
 
				+
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
			
 
				+static void codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
			
 
				+{
			
 
				+     FPRINTF(stderr, "%lx\n", (unsigned long) STARPU_VARIABLE_GET_PTR(descr[0]));
			
 
				+     FPRINTF(stderr, "codelet\n");
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+static struct starpu_codelet cuda_cl =
			
 
				+{
			
 
				+     .cuda_funcs = {codelet, NULL},
			
 
				+     .nbuffers = 1,
			
 
				+     .modes = {STARPU_R}
			
 
				+};
			
 
				+#endif
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+static struct starpu_codelet opencl_cl =
			
 
				+{
			
 
				+     .opencl_funcs = {codelet, NULL},
			
 
				+     .nbuffers = 1,
			
 
				+     .modes = {STARPU_R}
			
 
				+};
			
 
				+#endif
			
 
				+
			
 
				+void dotest(struct starpu_codelet *cl)
			
 
				+{
			
 
				+     int ret;
			
 
				+     int var = 42;
			
 
				+     starpu_data_handle_t handle;
			
 
				+
			
 
				+     starpu_variable_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)&var, sizeof(var));
			
 
				+
			
 
				+     ret = starpu_task_insert(cl, STARPU_R, handle, 0);
			
 
				+     if (ret == -ENODEV) goto enodev;
			
 
				+     STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				+
			
 
				+     starpu_task_wait_for_all();
			
 
				+
			
 
				+     starpu_data_unregister(handle);
			
 
				+
			
 
				+     starpu_variable_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)&var, sizeof(var));
			
 
				+
			
 
				+     ret = starpu_task_insert(cl, STARPU_R, handle, 0);
			
 
				+     if (ret == -ENODEV) goto enodev;
			
 
				+     STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				+
			
 
				+     starpu_task_wait_for_all();
			
 
				+
			
 
				+enodev:
			
 
				+     starpu_data_unregister(handle);
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+     int ret;
			
 
				+
			
 
				+     ret = starpu_init(NULL);
			
 
				+     if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				+     STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				+
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+     dotest(&cuda_cl);
			
 
				+#endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+     dotest(&opencl_cl);
			
 
				+#endif
			
 
				+
			
 
				+     starpu_shutdown();
			
 
				+
			
 
				+     return 0;
			
 
				+
			
 
				+enodev:
			
 
				+     starpu_shutdown();
			
 
				+     /* yes, we do not perform the computation but we did detect that no one
			
 
				+      * could perform the kernel, so this is not an error from StarPU */
			
 
				+     fprintf(stderr, "WARNING: No one can execute this task\n");
			
 
				+     return STARPU_TEST_SKIPPED;
			
 
				+}
			
--- a/tests/datawizard/gpu_ptr_register.c
+++ b/tests/datawizard/gpu_ptr_register.c
@@ -55,11 +55,13 @@ submit_tasks(starpu_data_handle_t handle, int pieces, int n)
 
				 static int
			
 
				 find_a_worker(enum starpu_worker_archtype type)
			
 
				 {
			
 
				-	int worker;
			
 
				-	int ret = starpu_worker_get_ids_by_type(type, &worker, 1);
			
 
				+	int worker[STARPU_NMAXWORKERS];
			
 
				+	int ret = starpu_worker_get_ids_by_type(type, worker, STARPU_NMAXWORKERS);
			
 
				 	if (ret == 0)
			
 
				 		return -ENODEV;
			
 
				-	return worker;
			
 
				+	if (ret == -ERANGE)
			
 
				+		return worker[STARPU_NMAXWORKERS-1];
			
 
				+	return worker[ret-1];
			
 
				 }
			
 
				 
			
 
				 static int
			
@@ -78,7 +80,7 @@ check_result(unsigned *t, size_t size)
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-#if CUDART_VERSION >= 4000
			
 
				+#ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				 static int
			
 
				 test_cuda(void)
			
 
				 {
			
@@ -100,8 +102,7 @@ test_cuda(void)
 
				 	size = 10 * n;
			
 
				 
			
 
				 	devid = starpu_worker_get_devid(chosen);
			
 
				-	starpu_cuda_set_device(devid);
			
 
				-	cudaMalloc((void**)&foo_gpu, size * sizeof(*foo_gpu));
			
 
				+	foo_gpu = (void*) starpu_malloc_on_node(starpu_worker_get_memory_node(chosen), size * sizeof(*foo_gpu));
			
 
				 
			
 
				 	foo = calloc(size, sizeof(*foo));
			
 
				 	for (i = 0; i < size; i++)
			
@@ -180,9 +181,7 @@ test_opencl(void)
 
				 	starpu_opencl_get_context(devid, &context);
			
 
				 	starpu_opencl_get_queue(devid, &queue);
			
 
				 
			
 
				-	foo_gpu = clCreateBuffer(context, CL_MEM_READ_WRITE, size*sizeof(int), NULL, &err);
			
 
				-	if (STARPU_UNLIKELY(err != CL_SUCCESS))
			
 
				-		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+	foo_gpu = (void*) starpu_malloc_on_node(starpu_worker_get_memory_node(chosen), size * sizeof(int));
			
 
				 
			
 
				 	unsigned int *foo = malloc(size*sizeof(*foo));
			
 
				 	for (i = 0; i < size; i++)
			
@@ -261,7 +260,7 @@ int main(int argc, char **argv)
 
				 #endif
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-#if CUDART_VERSION >= 4000 /* We need thread-safety of CUDA */
			
 
				+#ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				 	ret = test_cuda();
			
 
				 	if (ret == 1)
			
 
				 		goto fail;
			
--- a/tests/datawizard/gpu_register.c
+++ b/tests/datawizard/gpu_register.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2011-2012  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2011-2012, 2014  Université de Bordeaux 1
			
 
				  * Copyright (C) 2012 inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -55,11 +55,13 @@ submit_tasks(starpu_data_handle_t handle, int pieces, int n)
 
				 static int
			
 
				 find_a_worker(enum starpu_worker_archtype type)
			
 
				 {
			
 
				-	int worker;
			
 
				-	int ret = starpu_worker_get_ids_by_type(type, &worker, 1);
			
 
				+	int worker[STARPU_NMAXWORKERS];
			
 
				+	int ret = starpu_worker_get_ids_by_type(type, worker, STARPU_NMAXWORKERS);
			
 
				 	if (ret == 0)
			
 
				 		return -ENODEV;
			
 
				-	return worker;
			
 
				+	if (ret == -ERANGE)
			
 
				+		return worker[STARPU_NMAXWORKERS-1];
			
 
				+	return worker[ret-1];
			
 
				 }
			
 
				 
			
 
				 static int
			
@@ -78,7 +80,7 @@ check_result(unsigned *t, size_t size)
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-#if CUDART_VERSION >= 4000
			
 
				+#ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				 static int
			
 
				 test_cuda(void)
			
 
				 {
			
@@ -100,8 +102,7 @@ test_cuda(void)
 
				 	size = 10 * n;
			
 
				 
			
 
				 	devid = starpu_worker_get_devid(chosen);
			
 
				-	starpu_cuda_set_device(devid);
			
 
				-	cudaMalloc((void**)&foo_gpu, size * sizeof(*foo_gpu));
			
 
				+	foo_gpu = (void*) starpu_malloc_on_node(starpu_worker_get_memory_node(chosen), size * sizeof(*foo_gpu));
			
 
				 
			
 
				 	foo = calloc(size, sizeof(*foo));
			
 
				 	for (i = 0; i < size; i++)
			
@@ -182,9 +183,7 @@ test_opencl(void)
 
				 	starpu_opencl_get_context(devid, &context);
			
 
				 	starpu_opencl_get_queue(devid, &queue);
			
 
				 
			
 
				-	foo_gpu = clCreateBuffer(context, CL_MEM_READ_WRITE, size*sizeof(int), NULL, &err);
			
 
				-	if (STARPU_UNLIKELY(err != CL_SUCCESS))
			
 
				-		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+	foo_gpu = (void*) starpu_malloc_on_node(starpu_worker_get_memory_node(chosen), size * sizeof(int));
			
 
				 
			
 
				 	unsigned int *foo = malloc(size*sizeof(*foo));
			
 
				 	for (i = 0; i < size; i++)
			
@@ -269,7 +268,7 @@ int main(int argc, char **argv)
 
				 #endif
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-#if CUDART_VERSION >= 4000 /* We need thread-safety of CUDA */
			
 
				+#ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				 	ret = test_cuda();
			
 
				 	if (ret == 1)
			
 
				 		goto fail;
			
--- a/tests/sched_policies/simple_cpu_gpu_sched.c
+++ b/tests/sched_policies/simple_cpu_gpu_sched.c
@@ -205,7 +205,18 @@ run(struct starpu_sched_policy *policy)
 
				 	if (cpu_task_worker != STARPU_CPU_WORKER ||
			
 
				 			(gpu_task_worker != STARPU_CUDA_WORKER &&
			
 
				 			 gpu_task_worker != STARPU_OPENCL_WORKER))
			
 
				+	{
			
 
				+		if (cpu_task_worker != STARPU_CPU_WORKER)
			
 
				+		{
			
 
				+			FPRINTF(stderr, "The CPU task did not run on a CPU worker\n");
			
 
				+		}
			
 
				+		if (gpu_task_worker != STARPU_CUDA_WORKER && gpu_task_worker != STARPU_OPENCL_WORKER)
			
 
				+		{
			
 
				+			FPRINTF(stderr, "The GPU task did not run on a Cuda or OpenCL worker\n");
			
 
				+		}
			
 
				+
			
 
				 		ret = 1;
			
 
				+	}
			
 
				 	else
			
 
				 		ret = 0;