Browse Source

merge trunk

Nathalie Furmento 10 years ago
parent
commit
f56f454c4d

+ 7 - 5
doc/doxygen/chapters/13offline_performance_tools.doxy

@@ -119,8 +119,9 @@ $ vite paje.trace
 To get names of tasks instead of "unknown", fill the optional
 starpu_codelet::name, or use a performance model for them.
 Details of the codelet execution can be obtained by passing
-<c>--enable-paje-codelet-details</c> and using a recent enough version of ViTE
-(at least r1430).
+\ref enable-paje-codelet-details "--enable-paje-codelet-details" when
+configuring StarPU and using a recent enough version of ViTE (at least
+r1430).
 
 In the MPI execution case, collect the trace files from the MPI nodes, and
 specify them all on the command <c>starpu_fxt_tool</c>, for instance:
@@ -133,9 +134,10 @@ By default, all tasks are displayed using a green color. To display tasks with
 varying colors, pass option <c>-c</c> to <c>starpu_fxt_tool</c>.
 
 To identify tasks precisely, the application can set the ::tag_id field of the
-tasks (or use STARPU_TAG_ONY when using starpu_task_insert), and with a recent
-enough version of vite (>= r1430) and the <c>--enable-paje-codelet-details</c>
-configure option, the value of the tag will show up in the trace.
+tasks (or use STARPU_TAG_ONLY when using starpu_task_insert()), and with a recent
+enough version of vite (>= r1430) and the
+\ref enable-paje-codelet-details "--enable-paje-codelet-details"
+StarPU configure option, the value of the tag will show up in the trace.
 
 Traces can also be inspected by hand by using the tool <c>fxt_print</c>, for instance:
 

+ 2 - 1
examples/basic_examples/vector_scal_c.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- * Copyright (C) 2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2011, 2013-2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -47,6 +47,7 @@ static struct starpu_codelet cl =
 #ifdef STARPU_USE_CUDA
 	/* CUDA implementation of the codelet */
 	.cuda_funcs = {scal_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 1,
 	.model = &vector_scal_model

+ 1 - 1
examples/filters/custom_mf/custom_interface.c

@@ -425,7 +425,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
 	size = src_custom->nx * 2 * sizeof(float);
 	if (dst_custom->cpu_ptr == NULL)
 	{
-		ret = starpu_opencl_allocate_memory((cl_mem*)&dst_custom->cpu_ptr,
+		ret = starpu_opencl_allocate_memory(devid, (cl_mem*)&dst_custom->cpu_ptr,
 				size, CL_MEM_READ_WRITE);
 		assert(ret == CL_SUCCESS);
 	}

+ 2 - 2
include/starpu_opencl.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -76,7 +76,7 @@ int starpu_opencl_collect_stats(cl_event event);
 
 int starpu_opencl_set_kernel_args(cl_int *err, cl_kernel *kernel, ...);
 
-cl_int starpu_opencl_allocate_memory(cl_mem *addr, size_t size, cl_mem_flags flags);
+cl_int starpu_opencl_allocate_memory(int devid, cl_mem *addr, size_t size, cl_mem_flags flags);
 
 cl_int starpu_opencl_copy_ram_to_opencl(void *ptr, unsigned src_node, cl_mem buffer, unsigned dst_node, size_t size, size_t offset, cl_event *event, int *ret);
 

+ 17 - 17
mpi/src/starpu_mpi.c

@@ -173,12 +173,12 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
 
 	 _starpu_mpi_comm_amounts_inc(req->comm, req->srcdst, req->datatype, req->count);
 
-	 TRACE_MPI_ISEND_SUBMIT_BEGIN(req->srcdst, req->mpi_tag, 0);
+	 _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(req->srcdst, req->mpi_tag, 0);
 
 	 req->ret = MPI_Isend(req->ptr, req->count, req->datatype, req->srcdst, _starpu_mpi_tag, req->comm, &req->request);
 	 STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Isend returning %d", req->ret);
 
-	 TRACE_MPI_ISEND_SUBMIT_END(req->srcdst, req->mpi_tag, 0);
+	 _STARPU_MPI_TRACE_ISEND_SUBMIT_END(req->srcdst, req->mpi_tag, 0);
 
 	 /* somebody is perhaps waiting for the MPI request to be posted */
 	 STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
@@ -304,12 +304,12 @@ static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 
 	_STARPU_MPI_DEBUG(20, "post MPI irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
 
-	TRACE_MPI_IRECV_SUBMIT_BEGIN(req->srcdst, req->mpi_tag);
+	_STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(req->srcdst, req->mpi_tag);
 
 	req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->srcdst, _starpu_mpi_tag, req->comm, &req->request);
 	STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_IRecv returning %d", req->ret);
 
-	TRACE_MPI_IRECV_SUBMIT_END(req->srcdst, req->mpi_tag);
+	_STARPU_MPI_TRACE_IRECV_SUBMIT_END(req->srcdst, req->mpi_tag);
 
 	/* somebody is perhaps waiting for the MPI request to be posted */
 	STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
@@ -413,12 +413,12 @@ static void _starpu_mpi_wait_func(struct _starpu_mpi_req *waiting_req)
 	/* Which is the mpi request we are waiting for ? */
 	struct _starpu_mpi_req *req = waiting_req->other_request;
 
-	TRACE_MPI_UWAIT_BEGIN(req->srcdst, req->mpi_tag);
+	_STARPU_MPI_TRACE_UWAIT_BEGIN(req->srcdst, req->mpi_tag);
 
 	req->ret = MPI_Wait(&req->request, waiting_req->status);
 	STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Wait returning %d", req->ret);
 
-	TRACE_MPI_UWAIT_END(req->srcdst, req->mpi_tag);
+	_STARPU_MPI_TRACE_UWAIT_END(req->srcdst, req->mpi_tag);
 
 	_starpu_mpi_handle_request_termination(req);
 	_STARPU_MPI_LOG_OUT();
@@ -481,12 +481,12 @@ static void _starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
 	_STARPU_MPI_DEBUG(2, "Test request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n",
 			  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
 
-	TRACE_MPI_UTESTING_BEGIN(req->srcdst, req->mpi_tag);
+	_STARPU_MPI_TRACE_UTESTING_BEGIN(req->srcdst, req->mpi_tag);
 
 	req->ret = MPI_Test(&req->request, testing_req->flag, testing_req->status);
 	STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Test returning %d", req->ret);
 
-	TRACE_MPI_UTESTING_END(req->srcdst, req->mpi_tag);
+	_STARPU_MPI_TRACE_UTESTING_END(req->srcdst, req->mpi_tag);
 
 	if (*testing_req->flag)
 	{
@@ -933,22 +933,22 @@ static void _starpu_mpi_test_detached_requests(void)
 		{
 			if (req->request_type == RECV_REQ)
 			{
-				TRACE_MPI_IRECV_COMPLETE_BEGIN(req->srcdst, req->mpi_tag);
+				_STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(req->srcdst, req->mpi_tag);
 			}
 			else if (req->request_type == SEND_REQ)
 			{
-				TRACE_MPI_ISEND_COMPLETE_BEGIN(req->srcdst, req->mpi_tag, 0);
+				_STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(req->srcdst, req->mpi_tag, 0);
 			}
 
 			_starpu_mpi_handle_request_termination(req);
 
 			if (req->request_type == RECV_REQ)
 			{
-				TRACE_MPI_IRECV_COMPLETE_END(req->srcdst, req->mpi_tag);
+				_STARPU_MPI_TRACE_IRECV_COMPLETE_END(req->srcdst, req->mpi_tag);
 			}
 			else if (req->request_type == SEND_REQ)
 			{
-				TRACE_MPI_ISEND_COMPLETE_END(req->srcdst, req->mpi_tag, 0);
+				_STARPU_MPI_TRACE_ISEND_COMPLETE_END(req->srcdst, req->mpi_tag, 0);
 			}
 		}
 
@@ -1057,7 +1057,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
 
 	{
-		TRACE_MPI_START(rank, worldsize);
+		_STARPU_MPI_TRACE_START(rank, worldsize);
 #ifdef STARPU_USE_FXT
 		starpu_profiling_set_id(rank);
 #endif //STARPU_USE_FXT
@@ -1097,14 +1097,14 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		{
 			_STARPU_MPI_DEBUG(3, "NO MORE REQUESTS TO HANDLE\n");
 
-			TRACE_MPI_SLEEP_BEGIN();
+			_STARPU_MPI_TRACE_SLEEP_BEGIN();
 
 			if (barrier_running)
 				/* Tell mpi_barrier */
 				STARPU_PTHREAD_COND_SIGNAL(&cond_finished);
 			STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
 
-			TRACE_MPI_SLEEP_END();
+			_STARPU_MPI_TRACE_SLEEP_END();
 		}
 
 		/* get one request */
@@ -1323,7 +1323,7 @@ static void _starpu_mpi_add_sync_point_in_fxt(void)
 	ret = MPI_Bcast(&random_number, 1, MPI_INT, 0, MPI_COMM_WORLD);
 	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Bcast returning %d", ret);
 
-	TRACE_MPI_BARRIER(rank, worldsize, random_number);
+	_STARPU_MPI_TRACE_BARRIER(rank, worldsize, random_number);
 
 	_STARPU_MPI_DEBUG(3, "unique key %x\n", random_number);
 #endif
@@ -1407,7 +1407,7 @@ int starpu_mpi_shutdown(void)
 	starpu_progression_hook_deregister(hookid);
 #endif /* STARPU_MPI_ACTIVITY */
 
-	TRACE_MPI_STOP(rank, world_size);
+	_STARPU_MPI_TRACE_STOP(rank, world_size);
 
 	/* free the request queues */
 	_starpu_mpi_req_list_delete(detached_requests);

+ 76 - 76
mpi/src/starpu_mpi_fxt.h

@@ -26,86 +26,86 @@
 extern "C" {
 #endif
 
-#define FUT_MPI_START				0x5201
-#define FUT_MPI_STOP				0x5202
-#define FUT_MPI_BARRIER				0x5203
-#define FUT_MPI_ISEND_SUBMIT_BEGIN		0x5204
-#define FUT_MPI_ISEND_SUBMIT_END		0x5205
-#define FUT_MPI_IRECV_SUBMIT_BEGIN		0x5206
-#define FUT_MPI_IRECV_SUBMIT_END		0x5207
-#define FUT_MPI_ISEND_COMPLETE_BEGIN		0x5208
-#define FUT_MPI_ISEND_COMPLETE_END		0x5209
-#define FUT_MPI_IRECV_COMPLETE_BEGIN		0x5210
-#define FUT_MPI_IRECV_COMPLETE_END		0x5211
-#define FUT_MPI_SLEEP_BEGIN			0x5212
-#define FUT_MPI_SLEEP_END			0x5213
-#define FUT_MPI_DTESTING_BEGIN			0x5214
-#define FUT_MPI_DTESTING_END			0x5215
-#define FUT_MPI_UTESTING_BEGIN			0x5216
-#define FUT_MPI_UTESTING_END			0x5217
-#define FUT_MPI_UWAIT_BEGIN			0x5218
-#define FUT_MPI_UWAIT_END			0x5219
+#define _STARPU_MPI_FUT_START				0x5201
+#define _STARPU_MPI_FUT_STOP				0x5202
+#define _STARPU_MPI_FUT_BARRIER				0x5203
+#define _STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN		0x5204
+#define _STARPU_MPI_FUT_ISEND_SUBMIT_END		0x5205
+#define _STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN		0x5206
+#define _STARPU_MPI_FUT_IRECV_SUBMIT_END		0x5207
+#define _STARPU_MPI_FUT_ISEND_COMPLETE_BEGIN		0x5208
+#define _STARPU_MPI_FUT_ISEND_COMPLETE_END		0x5209
+#define _STARPU_MPI_FUT_IRECV_COMPLETE_BEGIN		0x5210
+#define _STARPU_MPI_FUT_IRECV_COMPLETE_END		0x5211
+#define _STARPU_MPI_FUT_SLEEP_BEGIN			0x5212
+#define _STARPU_MPI_FUT_SLEEP_END			0x5213
+#define _STARPU_MPI_FUT_DTESTING_BEGIN			0x5214
+#define _STARPU_MPI_FUT_DTESTING_END			0x5215
+#define _STARPU_MPI_FUT_UTESTING_BEGIN			0x5216
+#define _STARPU_MPI_FUT_UTESTING_END			0x5217
+#define _STARPU_MPI_FUT_UWAIT_BEGIN			0x5218
+#define _STARPU_MPI_FUT_UWAIT_END			0x5219
 
 #ifdef STARPU_USE_FXT
-#define TRACE_MPI_START(rank, worldsize)	\
-	FUT_DO_PROBE3(FUT_MPI_START, (rank), (worldsize), _starpu_gettid());
-#define TRACE_MPI_STOP(rank, worldsize)	\
-	FUT_DO_PROBE3(FUT_MPI_STOP, (rank), (worldsize), _starpu_gettid());
-#define TRACE_MPI_BARRIER(rank, worldsize, key)	\
-	FUT_DO_PROBE4(FUT_MPI_BARRIER, (rank), (worldsize), (key), _starpu_gettid());
-#define TRACE_MPI_ISEND_SUBMIT_BEGIN(dest, mpi_tag, size)	\
-	FUT_DO_PROBE4(FUT_MPI_ISEND_SUBMIT_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
-#define TRACE_MPI_ISEND_SUBMIT_END(dest, mpi_tag, size)	\
-	FUT_DO_PROBE4(FUT_MPI_ISEND_SUBMIT_END, (dest), (mpi_tag), (size), _starpu_gettid());
-#define TRACE_MPI_IRECV_SUBMIT_BEGIN(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_IRECV_SUBMIT_BEGIN, (src), (mpi_tag), _starpu_gettid());
-#define TRACE_MPI_IRECV_SUBMIT_END(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_IRECV_SUBMIT_END, (src), (mpi_tag), _starpu_gettid());
-#define TRACE_MPI_ISEND_COMPLETE_BEGIN(dest, mpi_tag, size)	\
-	FUT_DO_PROBE4(FUT_MPI_ISEND_COMPLETE_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
-#define TRACE_MPI_ISEND_COMPLETE_END(dest, mpi_tag, size)	\
-	FUT_DO_PROBE4(FUT_MPI_ISEND_COMPLETE_END, (dest), (mpi_tag), (size), _starpu_gettid());
-#define TRACE_MPI_IRECV_COMPLETE_BEGIN(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_IRECV_COMPLETE_BEGIN, (src), (mpi_tag), _starpu_gettid());
-#define TRACE_MPI_IRECV_COMPLETE_END(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_IRECV_COMPLETE_END, (src), (mpi_tag), _starpu_gettid());
-#define TRACE_MPI_SLEEP_BEGIN()	\
-	FUT_DO_PROBE1(FUT_MPI_SLEEP_BEGIN, _starpu_gettid());
-#define TRACE_MPI_SLEEP_END()	\
-	FUT_DO_PROBE1(FUT_MPI_SLEEP_END, _starpu_gettid());
-#define TRACE_MPI_DTESTING_BEGIN()	\
-	FUT_DO_PROBE1(FUT_MPI_DTESTING_BEGIN,  _starpu_gettid());
-#define TRACE_MPI_DTESTING_END()	\
-	FUT_DO_PROBE1(FUT_MPI_DTESTING_END, _starpu_gettid());
-#define TRACE_MPI_UTESTING_BEGIN(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_UTESTING_BEGIN, (src), (mpi_tag),  _starpu_gettid());
-#define TRACE_MPI_UTESTING_END(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_UTESTING_END, (src), (mpi_tag), _starpu_gettid());
-#define TRACE_MPI_UWAIT_BEGIN(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_UWAIT_BEGIN, (src), (mpi_tag),  _starpu_gettid());
-#define TRACE_MPI_UWAIT_END(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_UWAIT_END, (src), (mpi_tag), _starpu_gettid());
+#define _STARPU_MPI_TRACE_START(rank, worldsize)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_START, (rank), (worldsize), _starpu_gettid());
+#define _STARPU_MPI_TRACE_STOP(rank, worldsize)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_STOP, (rank), (worldsize), _starpu_gettid());
+#define _STARPU_MPI_TRACE_BARRIER(rank, worldsize, key)	\
+	FUT_DO_PROBE4(_STARPU_MPI_FUT_BARRIER, (rank), (worldsize), (key), _starpu_gettid());
+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(dest, mpi_tag, size)	\
+	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(dest, mpi_tag, size)	\
+	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_SUBMIT_END, (dest), (mpi_tag), (size), _starpu_gettid());
+#define _STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(src, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN, (src), (mpi_tag), _starpu_gettid());
+#define _STARPU_MPI_TRACE_IRECV_SUBMIT_END(src, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_SUBMIT_END, (src), (mpi_tag), _starpu_gettid());
+#define _STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(dest, mpi_tag, size)	\
+	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_COMPLETE_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
+#define _STARPU_MPI_TRACE_ISEND_COMPLETE_END(dest, mpi_tag, size)	\
+	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_COMPLETE_END, (dest), (mpi_tag), (size), _starpu_gettid());
+#define _STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(src, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_COMPLETE_BEGIN, (src), (mpi_tag), _starpu_gettid());
+#define _STARPU_MPI_TRACE_IRECV_COMPLETE_END(src, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_COMPLETE_END, (src), (mpi_tag), _starpu_gettid());
+#define _STARPU_MPI_TRACE_SLEEP_BEGIN()	\
+	FUT_DO_PROBE1(_STARPU_MPI_FUT_SLEEP_BEGIN, _starpu_gettid());
+#define _STARPU_MPI_TRACE_SLEEP_END()	\
+	FUT_DO_PROBE1(_STARPU_MPI_FUT_SLEEP_END, _starpu_gettid());
+#define _STARPU_MPI_TRACE_DTESTING_BEGIN()	\
+	FUT_DO_PROBE1(_STARPU_MPI_FUT_DTESTING_BEGIN,  _starpu_gettid());
+#define _STARPU_MPI_TRACE_DTESTING_END()	\
+	FUT_DO_PROBE1(_STARPU_MPI_FUT_DTESTING_END, _starpu_gettid());
+#define _STARPU_MPI_TRACE_UTESTING_BEGIN(src, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_UTESTING_BEGIN, (src), (mpi_tag),  _starpu_gettid());
+#define _STARPU_MPI_TRACE_UTESTING_END(src, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_UTESTING_END, (src), (mpi_tag), _starpu_gettid());
+#define _STARPU_MPI_TRACE_UWAIT_BEGIN(src, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_UWAIT_BEGIN, (src), (mpi_tag),  _starpu_gettid());
+#define _STARPU_MPI_TRACE_UWAIT_END(src, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_UWAIT_END, (src), (mpi_tag), _starpu_gettid());
 #define TRACE
 #else
-#define TRACE_MPI_START(a, b)				do {} while(0);
-#define TRACE_MPI_STOP(a, b)				do {} while(0);
-#define TRACE_MPI_BARRIER(a, b, c)			do {} while(0);
-#define TRACE_MPI_ISEND_SUBMIT_BEGIN(a, b, c)		do {} while(0);
-#define TRACE_MPI_ISEND_SUBMIT_END(a, b, c)		do {} while(0);
-#define TRACE_MPI_IRECV_SUBMIT_BEGIN(a, b)		do {} while(0);
-#define TRACE_MPI_IRECV_SUBMIT_END(a, b)		do {} while(0);
-#define TRACE_MPI_ISEND_COMPLETE_BEGIN(a, b, c)		do {} while(0);
-#define TRACE_MPI_ISEND_COMPLETE_END(a, b, c)		do {} while(0);
-#define TRACE_MPI_IRECV_COMPLETE_BEGIN(a, b)		do {} while(0);
-#define TRACE_MPI_IRECV_COMPLETE_END(a, b)		do {} while(0);
-#define TRACE_MPI_SLEEP_BEGIN()				do {} while(0);
-#define TRACE_MPI_SLEEP_END()				do {} while(0);
-#define TRACE_MPI_DTESTING_BEGIN()			do {} while(0);
-#define TRACE_MPI_DTESTING_END()			do {} while(0);
-#define TRACE_MPI_UTESTING_BEGIN(a, b)			do {} while(0);
-#define TRACE_MPI_UTESTING_END(a, b)			do {} while(0);
-#define TRACE_MPI_UWAIT_BEGIN(a, b)			do {} while(0);
-#define TRACE_MPI_UWAIT_END(a, b)			do {} while(0);
+#define _STARPU_MPI_TRACE_START(a, b)				do {} while(0);
+#define _STARPU_MPI_TRACE_STOP(a, b)				do {} while(0);
+#define _STARPU_MPI_TRACE_BARRIER(a, b, c)			do {} while(0);
+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(a, b, c)		do {} while(0);
+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(a, b, c)		do {} while(0);
+#define _STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(a, b)		do {} while(0);
+#define _STARPU_MPI_TRACE_IRECV_SUBMIT_END(a, b)		do {} while(0);
+#define _STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(a, b, c)		do {} while(0);
+#define _STARPU_MPI_TRACE_ISEND_COMPLETE_END(a, b, c)		do {} while(0);
+#define _STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(a, b)		do {} while(0);
+#define _STARPU_MPI_TRACE_IRECV_COMPLETE_END(a, b)		do {} while(0);
+#define _STARPU_MPI_TRACE_SLEEP_BEGIN()				do {} while(0);
+#define _STARPU_MPI_TRACE_SLEEP_END()				do {} while(0);
+#define _STARPU_MPI_TRACE_DTESTING_BEGIN()			do {} while(0);
+#define _STARPU_MPI_TRACE_DTESTING_END()			do {} while(0);
+#define _STARPU_MPI_TRACE_UTESTING_BEGIN(a, b)			do {} while(0);
+#define _STARPU_MPI_TRACE_UTESTING_END(a, b)			do {} while(0);
+#define _STARPU_MPI_TRACE_UWAIT_BEGIN(a, b)			do {} while(0);
+#define _STARPU_MPI_TRACE_UWAIT_END(a, b)			do {} while(0);
 #endif
 
 #ifdef __cplusplus

+ 9 - 0
src/core/simgrid.c

@@ -253,6 +253,10 @@ void _starpu_simgrid_init()
 	xbt_dynar_free(&hosts);
 }
 
+/*
+ * Tasks
+ */
+
 /* Task execution submitted by StarPU */
 void _starpu_simgrid_execute_job(struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length)
 {
@@ -276,8 +280,13 @@ void _starpu_simgrid_execute_job(struct _starpu_job *j, struct starpu_perfmodel_
 			length/1000000.0*MSG_get_host_speed(MSG_host_self()),
 			0, NULL);
 	MSG_task_execute(simgrid_task);
+	MSG_task_destroy(simgrid_task);
 }
 
+/*
+ * Transfers
+ */
+
 /* Note: simgrid is not parallel, so there is no need to hold locks for management of transfers.  */
 LIST_TYPE(transfer,
 	msg_task_t task;

+ 1 - 1
src/datawizard/coherency.c

@@ -261,7 +261,7 @@ static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned
 	if (src_node != dst_node && starpu_node_get_kind(src_node) == STARPU_CUDA_RAM && starpu_node_get_kind(dst_node) == STARPU_CUDA_RAM)
 	{
 		const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
-		if (!copy_methods->cuda_to_cuda_async)
+		if (!copy_methods->cuda_to_cuda_async && !copy_methods->any_to_any)
 			return 0;
 	}
 #endif

+ 18 - 2
src/datawizard/malloc.c

@@ -50,7 +50,7 @@ struct malloc_pinned_codelet_struct
 //{
 //	struct malloc_pinned_codelet_struct *s = arg;
 //        //        *(s->ptr) = malloc(s->dim);
-//        starpu_opencl_allocate_memory((void **)(s->ptr), s->dim, CL_MEM_READ_WRITE|CL_MEM_ALLOC_HOST_PTR);
+//        starpu_opencl_allocate_memory(devid, (void **)(s->ptr), s->dim, CL_MEM_READ_WRITE|CL_MEM_ALLOC_HOST_PTR);
 //}
 //#endif
 
@@ -404,6 +404,14 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size)
 			STARPU_ASSERT(last[dst_node] >= addr);
 			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
 #else
+			struct _starpu_worker *worker = _starpu_get_local_worker_key();
+			unsigned devid = _starpu_memory_node_get_devid(dst_node);
+			if (!worker || worker->arch != STARPU_CUDA_WORKER || worker->devid != devid)
+#if defined(HAVE_CUDA_MEMCPY_PEER)
+				starpu_cuda_set_device(devid);
+#else
+				STARPU_ASSERT_MSG(0, "CUDA peer access is not available with this version of CUDA");
+#endif
 			status = cudaMalloc((void **)&addr, size);
 			if (!addr || (status != cudaSuccess))
 			{
@@ -433,7 +441,7 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size)
                                 int ret;
 				cl_mem ptr;
 
-				ret = starpu_opencl_allocate_memory(&ptr, size, CL_MEM_READ_WRITE);
+				ret = starpu_opencl_allocate_memory(_starpu_memory_node_get_devid(dst_node), &ptr, size, CL_MEM_READ_WRITE);
 				if (ret)
 				{
 					addr = 0;
@@ -505,6 +513,14 @@ _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
 #else
 			cudaError_t err;
+			struct _starpu_worker *worker = _starpu_get_local_worker_key();
+			unsigned devid = _starpu_memory_node_get_devid(dst_node);
+			if (!worker || worker->arch != STARPU_CUDA_WORKER || worker->devid != devid)
+#if defined(HAVE_CUDA_MEMCPY_PEER)
+				starpu_cuda_set_device(devid);
+#else
+				STARPU_ASSERT_MSG(0, "CUDA peer access is not available with this version of CUDA");
+#endif
 			err = cudaFree((void*)addr);
 			if (STARPU_UNLIKELY(err != cudaSuccess))
 				STARPU_CUDA_REPORT_ERROR(err);

+ 10 - 5
src/datawizard/memalloc.c

@@ -440,19 +440,24 @@ static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_re
 
 	if (!old_replicate)
 	{
+		/* Free the copy that we made */
 		free(mc->chunk_interface);
 		mc->chunk_interface = NULL;
 	}
 
-	mc->data = new_replicate->handle;
-	/* mc->ops, mc->footprint and mc->interface should be
+	/* XXX: We do not actually reuse the mc at the moment, only the interface */
+
+	/* mc->data = new_replicate->handle; */
+	/* mc->footprint, mc->ops, mc->size_interface, mc->automatically_allocated should be
  	 * unchanged ! */
 
-	/* reinsert the mem chunk in the list of active memory chunks */
-	if (!is_already_in_mc_list)
+	/* remove the mem chunk from the list of active memory chunks, register_mem_chunk will put it back later */
+	if (is_already_in_mc_list)
 	{
-		_starpu_mem_chunk_list_push_back(mc_list[node], mc);
+		_starpu_mem_chunk_list_erase(mc_list[node], mc);
 	}
+
+	free(mc);
 }
 
 static unsigned try_to_reuse_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node, struct _starpu_data_replicate *replicate, unsigned is_already_in_mc_list)

+ 19 - 19
src/debug/traces/starpu_fxt.c

@@ -1891,79 +1891,79 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 				handle_user_event(&ev, options);
 				break;
 
-			case FUT_MPI_START:
+			case _STARPU_MPI_FUT_START:
 				handle_mpi_start(&ev, options);
 				break;
 
-			case FUT_MPI_STOP:
+			case _STARPU_MPI_FUT_STOP:
 				handle_mpi_stop(&ev, options);
 				break;
 
-			case FUT_MPI_BARRIER:
+			case _STARPU_MPI_FUT_BARRIER:
 				handle_mpi_barrier(&ev, options);
 				break;
 
-			case FUT_MPI_ISEND_SUBMIT_BEGIN:
+			case _STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN:
 				handle_mpi_isend_submit_begin(&ev, options);
 				break;
 
-			case FUT_MPI_ISEND_SUBMIT_END:
+			case _STARPU_MPI_FUT_ISEND_SUBMIT_END:
 				handle_mpi_isend_submit_end(&ev, options);
 				break;
 
-			case FUT_MPI_IRECV_SUBMIT_BEGIN:
+			case _STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN:
 				handle_mpi_irecv_submit_begin(&ev, options);
 				break;
 
-			case FUT_MPI_IRECV_SUBMIT_END:
+			case _STARPU_MPI_FUT_IRECV_SUBMIT_END:
 				handle_mpi_irecv_submit_end(&ev, options);
 				break;
 
-			case FUT_MPI_ISEND_COMPLETE_BEGIN:
+			case _STARPU_MPI_FUT_ISEND_COMPLETE_BEGIN:
 				handle_mpi_isend_complete_begin(&ev, options);
 				break;
 
-			case FUT_MPI_ISEND_COMPLETE_END:
+			case _STARPU_MPI_FUT_ISEND_COMPLETE_END:
 				handle_mpi_isend_complete_end(&ev, options);
 				break;
 
-			case FUT_MPI_IRECV_COMPLETE_BEGIN:
+			case _STARPU_MPI_FUT_IRECV_COMPLETE_BEGIN:
 				handle_mpi_irecv_complete_begin(&ev, options);
 				break;
 
-			case FUT_MPI_IRECV_COMPLETE_END:
+			case _STARPU_MPI_FUT_IRECV_COMPLETE_END:
 				handle_mpi_irecv_complete_end(&ev, options);
 				break;
 
-			case FUT_MPI_SLEEP_BEGIN:
+			case _STARPU_MPI_FUT_SLEEP_BEGIN:
 				handle_mpi_sleep_begin(&ev, options);
 				break;
 
-			case FUT_MPI_SLEEP_END:
+			case _STARPU_MPI_FUT_SLEEP_END:
 				handle_mpi_sleep_end(&ev, options);
 				break;
 
-			case FUT_MPI_DTESTING_BEGIN:
+			case _STARPU_MPI_FUT_DTESTING_BEGIN:
 				handle_mpi_dtesting_begin(&ev, options);
 				break;
 
-			case FUT_MPI_DTESTING_END:
+			case _STARPU_MPI_FUT_DTESTING_END:
 				handle_mpi_dtesting_end(&ev, options);
 				break;
 
-			case FUT_MPI_UTESTING_BEGIN:
+			case _STARPU_MPI_FUT_UTESTING_BEGIN:
 				handle_mpi_utesting_begin(&ev, options);
 				break;
 
-			case FUT_MPI_UTESTING_END:
+			case _STARPU_MPI_FUT_UTESTING_END:
 				handle_mpi_utesting_end(&ev, options);
 				break;
 
-			case FUT_MPI_UWAIT_BEGIN:
+			case _STARPU_MPI_FUT_UWAIT_BEGIN:
 				handle_mpi_uwait_begin(&ev, options);
 				break;
 
-			case FUT_MPI_UWAIT_END:
+			case _STARPU_MPI_FUT_UWAIT_END:
 				handle_mpi_uwait_end(&ev, options);
 				break;
 

+ 1 - 1
src/debug/traces/starpu_fxt_mpi.c

@@ -74,7 +74,7 @@ int _starpu_fxt_mpi_find_sync_point(char *filename_in, uint64_t *offset, int *ke
 			break;
 		}
 
-		if (ev.code == FUT_MPI_BARRIER)
+		if (ev.code == _STARPU_MPI_FUT_BARRIER)
 		{
 			/* We found the sync point */
 			*offset = ev.time;

+ 3 - 4
src/drivers/opencl/driver_opencl.c

@@ -214,16 +214,15 @@ cl_int _starpu_opencl_deinit_context(int devid)
 }
 #endif
 
-cl_int starpu_opencl_allocate_memory(cl_mem *mem STARPU_ATTRIBUTE_UNUSED, size_t size STARPU_ATTRIBUTE_UNUSED, cl_mem_flags flags STARPU_ATTRIBUTE_UNUSED)
+cl_int starpu_opencl_allocate_memory(int devid, cl_mem *mem STARPU_ATTRIBUTE_UNUSED, size_t size STARPU_ATTRIBUTE_UNUSED, cl_mem_flags flags STARPU_ATTRIBUTE_UNUSED)
 {
 #ifdef STARPU_SIMGRID
 	STARPU_ABORT();
 #else
 	cl_int err;
         cl_mem memory;
-        struct _starpu_worker *worker = _starpu_get_local_worker_key();
 
-	memory = clCreateBuffer(contexts[worker->devid], flags, size, NULL, &err);
+	memory = clCreateBuffer(contexts[devid], flags, size, NULL, &err);
 	if (err == CL_OUT_OF_HOST_MEMORY) return err;
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
@@ -234,7 +233,7 @@ cl_int starpu_opencl_allocate_memory(cl_mem *mem STARPU_ATTRIBUTE_UNUSED, size_t
 	 */
 	char dummy = 0;
 	cl_event ev;
-	err = clEnqueueWriteBuffer(alloc_queues[worker->devid], memory, CL_TRUE,
+	err = clEnqueueWriteBuffer(alloc_queues[devid], memory, CL_TRUE,
 				   0, sizeof(dummy), &dummy,
 				   0, NULL, &ev);
 	if (err == CL_MEM_OBJECT_ALLOCATION_FAILURE)

+ 1 - 2
src/sched_policies/locality_work_stealing_policy.c

@@ -224,13 +224,12 @@ static int lws_push_task(struct starpu_task *task)
 #ifndef STARPU_NON_BLOCKING_DRIVERS
 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
 	struct starpu_sched_ctx_iterator it;
+	unsigned worker;
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 	while(workers->has_next(workers, &it))
 	{
 		worker = workers->get_next(workers, &it);
-		starpu_pthread_mutex_t *sched_mutex;
-		starpu_pthread_cond_t *sched_cond;
 		starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
 		STARPU_PTHREAD_COND_SIGNAL(sched_cond);
 	}

+ 1 - 0
tests/Makefile.am

@@ -151,6 +151,7 @@ noinst_PROGRAMS =				\
 	datawizard/acquire_cb_insert		\
 	datawizard/acquire_release		\
 	datawizard/acquire_release2		\
+	datawizard/cache			\
 	datawizard/commute			\
 	datawizard/copy				\
 	datawizard/data_implicit_deps		\

+ 100 - 0
tests/datawizard/cache.c

@@ -0,0 +1,100 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2014  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "../helper.h"
+
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
+static void codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+     FPRINTF(stderr, "%lx\n", (unsigned long) STARPU_VARIABLE_GET_PTR(descr[0]));
+     FPRINTF(stderr, "codelet\n");
+}
+#endif
+
+#ifdef STARPU_USE_CUDA
+static struct starpu_codelet cuda_cl =
+{
+     .cuda_funcs = {codelet, NULL},
+     .nbuffers = 1,
+     .modes = {STARPU_R}
+};
+#endif
+
+#ifdef STARPU_USE_OPENCL
+static struct starpu_codelet opencl_cl =
+{
+     .opencl_funcs = {codelet, NULL},
+     .nbuffers = 1,
+     .modes = {STARPU_R}
+};
+#endif
+
+void dotest(struct starpu_codelet *cl)
+{
+     int ret;
+     int var = 42;
+     starpu_data_handle_t handle;
+
+     starpu_variable_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)&var, sizeof(var));
+
+     ret = starpu_task_insert(cl, STARPU_R, handle, 0);
+     if (ret == -ENODEV) goto enodev;
+     STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+
+     starpu_task_wait_for_all();
+
+     starpu_data_unregister(handle);
+
+     starpu_variable_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)&var, sizeof(var));
+
+     ret = starpu_task_insert(cl, STARPU_R, handle, 0);
+     if (ret == -ENODEV) goto enodev;
+     STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+
+     starpu_task_wait_for_all();
+
+enodev:
+     starpu_data_unregister(handle);
+}
+
+int main(int argc, char **argv)
+{
+     int ret;
+
+     ret = starpu_init(NULL);
+     if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+     STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_USE_CUDA
+     dotest(&cuda_cl);
+#endif
+#ifdef STARPU_USE_OPENCL
+     dotest(&opencl_cl);
+#endif
+
+     starpu_shutdown();
+
+     return 0;
+
+enodev:
+     starpu_shutdown();
+     /* yes, we do not perform the computation but we did detect that no one
+      * could perform the kernel, so this is not an error from StarPU */
+     fprintf(stderr, "WARNING: No one can execute this task\n");
+     return STARPU_TEST_SKIPPED;
+}

+ 9 - 10
tests/datawizard/gpu_ptr_register.c

@@ -55,11 +55,13 @@ submit_tasks(starpu_data_handle_t handle, int pieces, int n)
 static int
 find_a_worker(enum starpu_worker_archtype type)
 {
-	int worker;
-	int ret = starpu_worker_get_ids_by_type(type, &worker, 1);
+	int worker[STARPU_NMAXWORKERS];
+	int ret = starpu_worker_get_ids_by_type(type, worker, STARPU_NMAXWORKERS);
 	if (ret == 0)
 		return -ENODEV;
-	return worker;
+	if (ret == -ERANGE)
+		return worker[STARPU_NMAXWORKERS-1];
+	return worker[ret-1];
 }
 
 static int
@@ -78,7 +80,7 @@ check_result(unsigned *t, size_t size)
 }
 
 #ifdef STARPU_USE_CUDA
-#if CUDART_VERSION >= 4000
+#ifdef HAVE_CUDA_MEMCPY_PEER
 static int
 test_cuda(void)
 {
@@ -100,8 +102,7 @@ test_cuda(void)
 	size = 10 * n;
 
 	devid = starpu_worker_get_devid(chosen);
-	starpu_cuda_set_device(devid);
-	cudaMalloc((void**)&foo_gpu, size * sizeof(*foo_gpu));
+	foo_gpu = (void*) starpu_malloc_on_node(starpu_worker_get_memory_node(chosen), size * sizeof(*foo_gpu));
 
 	foo = calloc(size, sizeof(*foo));
 	for (i = 0; i < size; i++)
@@ -180,9 +181,7 @@ test_opencl(void)
 	starpu_opencl_get_context(devid, &context);
 	starpu_opencl_get_queue(devid, &queue);
 
-	foo_gpu = clCreateBuffer(context, CL_MEM_READ_WRITE, size*sizeof(int), NULL, &err);
-	if (STARPU_UNLIKELY(err != CL_SUCCESS))
-		STARPU_OPENCL_REPORT_ERROR(err);
+	foo_gpu = (void*) starpu_malloc_on_node(starpu_worker_get_memory_node(chosen), size * sizeof(int));
 
 	unsigned int *foo = malloc(size*sizeof(*foo));
 	for (i = 0; i < size; i++)
@@ -261,7 +260,7 @@ int main(int argc, char **argv)
 #endif
 
 #ifdef STARPU_USE_CUDA
-#if CUDART_VERSION >= 4000 /* We need thread-safety of CUDA */
+#ifdef HAVE_CUDA_MEMCPY_PEER
 	ret = test_cuda();
 	if (ret == 1)
 		goto fail;

+ 10 - 11
tests/datawizard/gpu_register.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011-2012  Université de Bordeaux 1
+ * Copyright (C) 2011-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2012 inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -55,11 +55,13 @@ submit_tasks(starpu_data_handle_t handle, int pieces, int n)
 static int
 find_a_worker(enum starpu_worker_archtype type)
 {
-	int worker;
-	int ret = starpu_worker_get_ids_by_type(type, &worker, 1);
+	int worker[STARPU_NMAXWORKERS];
+	int ret = starpu_worker_get_ids_by_type(type, worker, STARPU_NMAXWORKERS);
 	if (ret == 0)
 		return -ENODEV;
-	return worker;
+	if (ret == -ERANGE)
+		return worker[STARPU_NMAXWORKERS-1];
+	return worker[ret-1];
 }
 
 static int
@@ -78,7 +80,7 @@ check_result(unsigned *t, size_t size)
 }
 
 #ifdef STARPU_USE_CUDA
-#if CUDART_VERSION >= 4000
+#ifdef HAVE_CUDA_MEMCPY_PEER
 static int
 test_cuda(void)
 {
@@ -100,8 +102,7 @@ test_cuda(void)
 	size = 10 * n;
 
 	devid = starpu_worker_get_devid(chosen);
-	starpu_cuda_set_device(devid);
-	cudaMalloc((void**)&foo_gpu, size * sizeof(*foo_gpu));
+	foo_gpu = (void*) starpu_malloc_on_node(starpu_worker_get_memory_node(chosen), size * sizeof(*foo_gpu));
 
 	foo = calloc(size, sizeof(*foo));
 	for (i = 0; i < size; i++)
@@ -182,9 +183,7 @@ test_opencl(void)
 	starpu_opencl_get_context(devid, &context);
 	starpu_opencl_get_queue(devid, &queue);
 
-	foo_gpu = clCreateBuffer(context, CL_MEM_READ_WRITE, size*sizeof(int), NULL, &err);
-	if (STARPU_UNLIKELY(err != CL_SUCCESS))
-		STARPU_OPENCL_REPORT_ERROR(err);
+	foo_gpu = (void*) starpu_malloc_on_node(starpu_worker_get_memory_node(chosen), size * sizeof(int));
 
 	unsigned int *foo = malloc(size*sizeof(*foo));
 	for (i = 0; i < size; i++)
@@ -269,7 +268,7 @@ int main(int argc, char **argv)
 #endif
 
 #ifdef STARPU_USE_CUDA
-#if CUDART_VERSION >= 4000 /* We need thread-safety of CUDA */
+#ifdef HAVE_CUDA_MEMCPY_PEER
 	ret = test_cuda();
 	if (ret == 1)
 		goto fail;

+ 10 - 0
tests/sched_policies/simple_cpu_gpu_sched.c

@@ -200,6 +200,16 @@ run(struct starpu_sched_policy *policy)
 	if (cpu_task_worker != STARPU_CPU_WORKER || (gpu_task_worker != STARPU_CUDA_WORKER && gpu_task_worker != STARPU_OPENCL_WORKER))
 	{
 		FPRINTF(stderr, "Tasks did not execute on expected worker\n");
+	{
+		if (cpu_task_worker != STARPU_CPU_WORKER)
+		{
+			FPRINTF(stderr, "The CPU task did not run on a CPU worker\n");
+		}
+		if (gpu_task_worker != STARPU_CUDA_WORKER && gpu_task_worker != STARPU_OPENCL_WORKER)
+		{
+			FPRINTF(stderr, "The GPU task did not run on a Cuda or OpenCL worker\n");
+		}
+
 		ret = 1;
 	}
 	else