Browse Source

merge trunk

Nathalie Furmento 10 years ago
parent
commit
f56f454c4d

+ 7 - 5
doc/doxygen/chapters/13offline_performance_tools.doxy

@@ -119,8 +119,9 @@ $ vite paje.trace
 To get names of tasks instead of "unknown", fill the optional
 To get names of tasks instead of "unknown", fill the optional
 starpu_codelet::name, or use a performance model for them.
 starpu_codelet::name, or use a performance model for them.
 Details of the codelet execution can be obtained by passing
 Details of the codelet execution can be obtained by passing
-<c>--enable-paje-codelet-details</c> and using a recent enough version of ViTE
+\ref enable-paje-codelet-details "--enable-paje-codelet-details" when
-(at least r1430).
+configuring StarPU and using a recent enough version of ViTE (at least
+r1430).
 
 
 In the MPI execution case, collect the trace files from the MPI nodes, and
 In the MPI execution case, collect the trace files from the MPI nodes, and
 specify them all on the command <c>starpu_fxt_tool</c>, for instance:
 specify them all on the command <c>starpu_fxt_tool</c>, for instance:
@@ -133,9 +134,10 @@ By default, all tasks are displayed using a green color. To display tasks with
 varying colors, pass option <c>-c</c> to <c>starpu_fxt_tool</c>.
 varying colors, pass option <c>-c</c> to <c>starpu_fxt_tool</c>.
 
 
 To identify tasks precisely, the application can set the ::tag_id field of the
 To identify tasks precisely, the application can set the ::tag_id field of the
-tasks (or use STARPU_TAG_ONY when using starpu_task_insert), and with a recent
+tasks (or use STARPU_TAG_ONLY when using starpu_task_insert()), and with a recent
-enough version of vite (>= r1430) and the <c>--enable-paje-codelet-details</c>
+enough version of vite (>= r1430) and the
-configure option, the value of the tag will show up in the trace.
+\ref enable-paje-codelet-details "--enable-paje-codelet-details"
+StarPU configure option, the value of the tag will show up in the trace.
 
 
 Traces can also be inspected by hand by using the tool <c>fxt_print</c>, for instance:
 Traces can also be inspected by hand by using the tool <c>fxt_print</c>, for instance:
 
 

+ 2 - 1
examples/basic_examples/vector_scal_c.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- * Copyright (C) 2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2011, 2013-2014  Université de Bordeaux 1
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -47,6 +47,7 @@ static struct starpu_codelet cl =
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	/* CUDA implementation of the codelet */
 	/* CUDA implementation of the codelet */
 	.cuda_funcs = {scal_cuda_func, NULL},
 	.cuda_funcs = {scal_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #endif
 	.nbuffers = 1,
 	.nbuffers = 1,
 	.model = &vector_scal_model
 	.model = &vector_scal_model

+ 1 - 1
examples/filters/custom_mf/custom_interface.c

@@ -425,7 +425,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
 	size = src_custom->nx * 2 * sizeof(float);
 	size = src_custom->nx * 2 * sizeof(float);
 	if (dst_custom->cpu_ptr == NULL)
 	if (dst_custom->cpu_ptr == NULL)
 	{
 	{
-		ret = starpu_opencl_allocate_memory((cl_mem*)&dst_custom->cpu_ptr,
+		ret = starpu_opencl_allocate_memory(devid, (cl_mem*)&dst_custom->cpu_ptr,
 				size, CL_MEM_READ_WRITE);
 				size, CL_MEM_READ_WRITE);
 		assert(ret == CL_SUCCESS);
 		assert(ret == CL_SUCCESS);
 	}
 	}

+ 2 - 2
include/starpu_opencl.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -76,7 +76,7 @@ int starpu_opencl_collect_stats(cl_event event);
 
 
 int starpu_opencl_set_kernel_args(cl_int *err, cl_kernel *kernel, ...);
 int starpu_opencl_set_kernel_args(cl_int *err, cl_kernel *kernel, ...);
 
 
-cl_int starpu_opencl_allocate_memory(cl_mem *addr, size_t size, cl_mem_flags flags);
+cl_int starpu_opencl_allocate_memory(int devid, cl_mem *addr, size_t size, cl_mem_flags flags);
 
 
 cl_int starpu_opencl_copy_ram_to_opencl(void *ptr, unsigned src_node, cl_mem buffer, unsigned dst_node, size_t size, size_t offset, cl_event *event, int *ret);
 cl_int starpu_opencl_copy_ram_to_opencl(void *ptr, unsigned src_node, cl_mem buffer, unsigned dst_node, size_t size, size_t offset, cl_event *event, int *ret);
 
 

+ 17 - 17
mpi/src/starpu_mpi.c

@@ -173,12 +173,12 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
 
 
 	 _starpu_mpi_comm_amounts_inc(req->comm, req->srcdst, req->datatype, req->count);
 	 _starpu_mpi_comm_amounts_inc(req->comm, req->srcdst, req->datatype, req->count);
 
 
-	 TRACE_MPI_ISEND_SUBMIT_BEGIN(req->srcdst, req->mpi_tag, 0);
+	 _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(req->srcdst, req->mpi_tag, 0);
 
 
 	 req->ret = MPI_Isend(req->ptr, req->count, req->datatype, req->srcdst, _starpu_mpi_tag, req->comm, &req->request);
 	 req->ret = MPI_Isend(req->ptr, req->count, req->datatype, req->srcdst, _starpu_mpi_tag, req->comm, &req->request);
 	 STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Isend returning %d", req->ret);
 	 STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Isend returning %d", req->ret);
 
 
-	 TRACE_MPI_ISEND_SUBMIT_END(req->srcdst, req->mpi_tag, 0);
+	 _STARPU_MPI_TRACE_ISEND_SUBMIT_END(req->srcdst, req->mpi_tag, 0);
 
 
 	 /* somebody is perhaps waiting for the MPI request to be posted */
 	 /* somebody is perhaps waiting for the MPI request to be posted */
 	 STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
 	 STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
@@ -304,12 +304,12 @@ static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 
 
 	_STARPU_MPI_DEBUG(20, "post MPI irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
 	_STARPU_MPI_DEBUG(20, "post MPI irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
 
 
-	TRACE_MPI_IRECV_SUBMIT_BEGIN(req->srcdst, req->mpi_tag);
+	_STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(req->srcdst, req->mpi_tag);
 
 
 	req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->srcdst, _starpu_mpi_tag, req->comm, &req->request);
 	req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->srcdst, _starpu_mpi_tag, req->comm, &req->request);
 	STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_IRecv returning %d", req->ret);
 	STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_IRecv returning %d", req->ret);
 
 
-	TRACE_MPI_IRECV_SUBMIT_END(req->srcdst, req->mpi_tag);
+	_STARPU_MPI_TRACE_IRECV_SUBMIT_END(req->srcdst, req->mpi_tag);
 
 
 	/* somebody is perhaps waiting for the MPI request to be posted */
 	/* somebody is perhaps waiting for the MPI request to be posted */
 	STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
@@ -413,12 +413,12 @@ static void _starpu_mpi_wait_func(struct _starpu_mpi_req *waiting_req)
 	/* Which is the mpi request we are waiting for ? */
 	/* Which is the mpi request we are waiting for ? */
 	struct _starpu_mpi_req *req = waiting_req->other_request;
 	struct _starpu_mpi_req *req = waiting_req->other_request;
 
 
-	TRACE_MPI_UWAIT_BEGIN(req->srcdst, req->mpi_tag);
+	_STARPU_MPI_TRACE_UWAIT_BEGIN(req->srcdst, req->mpi_tag);
 
 
 	req->ret = MPI_Wait(&req->request, waiting_req->status);
 	req->ret = MPI_Wait(&req->request, waiting_req->status);
 	STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Wait returning %d", req->ret);
 	STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Wait returning %d", req->ret);
 
 
-	TRACE_MPI_UWAIT_END(req->srcdst, req->mpi_tag);
+	_STARPU_MPI_TRACE_UWAIT_END(req->srcdst, req->mpi_tag);
 
 
 	_starpu_mpi_handle_request_termination(req);
 	_starpu_mpi_handle_request_termination(req);
 	_STARPU_MPI_LOG_OUT();
 	_STARPU_MPI_LOG_OUT();
@@ -481,12 +481,12 @@ static void _starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
 	_STARPU_MPI_DEBUG(2, "Test request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n",
 	_STARPU_MPI_DEBUG(2, "Test request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n",
 			  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
 			  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
 
 
-	TRACE_MPI_UTESTING_BEGIN(req->srcdst, req->mpi_tag);
+	_STARPU_MPI_TRACE_UTESTING_BEGIN(req->srcdst, req->mpi_tag);
 
 
 	req->ret = MPI_Test(&req->request, testing_req->flag, testing_req->status);
 	req->ret = MPI_Test(&req->request, testing_req->flag, testing_req->status);
 	STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Test returning %d", req->ret);
 	STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Test returning %d", req->ret);
 
 
-	TRACE_MPI_UTESTING_END(req->srcdst, req->mpi_tag);
+	_STARPU_MPI_TRACE_UTESTING_END(req->srcdst, req->mpi_tag);
 
 
 	if (*testing_req->flag)
 	if (*testing_req->flag)
 	{
 	{
@@ -933,22 +933,22 @@ static void _starpu_mpi_test_detached_requests(void)
 		{
 		{
 			if (req->request_type == RECV_REQ)
 			if (req->request_type == RECV_REQ)
 			{
 			{
-				TRACE_MPI_IRECV_COMPLETE_BEGIN(req->srcdst, req->mpi_tag);
+				_STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(req->srcdst, req->mpi_tag);
 			}
 			}
 			else if (req->request_type == SEND_REQ)
 			else if (req->request_type == SEND_REQ)
 			{
 			{
-				TRACE_MPI_ISEND_COMPLETE_BEGIN(req->srcdst, req->mpi_tag, 0);
+				_STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(req->srcdst, req->mpi_tag, 0);
 			}
 			}
 
 
 			_starpu_mpi_handle_request_termination(req);
 			_starpu_mpi_handle_request_termination(req);
 
 
 			if (req->request_type == RECV_REQ)
 			if (req->request_type == RECV_REQ)
 			{
 			{
-				TRACE_MPI_IRECV_COMPLETE_END(req->srcdst, req->mpi_tag);
+				_STARPU_MPI_TRACE_IRECV_COMPLETE_END(req->srcdst, req->mpi_tag);
 			}
 			}
 			else if (req->request_type == SEND_REQ)
 			else if (req->request_type == SEND_REQ)
 			{
 			{
-				TRACE_MPI_ISEND_COMPLETE_END(req->srcdst, req->mpi_tag, 0);
+				_STARPU_MPI_TRACE_ISEND_COMPLETE_END(req->srcdst, req->mpi_tag, 0);
 			}
 			}
 		}
 		}
 
 
@@ -1057,7 +1057,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
 	MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
 
 
 	{
 	{
-		TRACE_MPI_START(rank, worldsize);
+		_STARPU_MPI_TRACE_START(rank, worldsize);
 #ifdef STARPU_USE_FXT
 #ifdef STARPU_USE_FXT
 		starpu_profiling_set_id(rank);
 		starpu_profiling_set_id(rank);
 #endif //STARPU_USE_FXT
 #endif //STARPU_USE_FXT
@@ -1097,14 +1097,14 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		{
 		{
 			_STARPU_MPI_DEBUG(3, "NO MORE REQUESTS TO HANDLE\n");
 			_STARPU_MPI_DEBUG(3, "NO MORE REQUESTS TO HANDLE\n");
 
 
-			TRACE_MPI_SLEEP_BEGIN();
+			_STARPU_MPI_TRACE_SLEEP_BEGIN();
 
 
 			if (barrier_running)
 			if (barrier_running)
 				/* Tell mpi_barrier */
 				/* Tell mpi_barrier */
 				STARPU_PTHREAD_COND_SIGNAL(&cond_finished);
 				STARPU_PTHREAD_COND_SIGNAL(&cond_finished);
 			STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
 			STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
 
 
-			TRACE_MPI_SLEEP_END();
+			_STARPU_MPI_TRACE_SLEEP_END();
 		}
 		}
 
 
 		/* get one request */
 		/* get one request */
@@ -1323,7 +1323,7 @@ static void _starpu_mpi_add_sync_point_in_fxt(void)
 	ret = MPI_Bcast(&random_number, 1, MPI_INT, 0, MPI_COMM_WORLD);
 	ret = MPI_Bcast(&random_number, 1, MPI_INT, 0, MPI_COMM_WORLD);
 	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Bcast returning %d", ret);
 	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Bcast returning %d", ret);
 
 
-	TRACE_MPI_BARRIER(rank, worldsize, random_number);
+	_STARPU_MPI_TRACE_BARRIER(rank, worldsize, random_number);
 
 
 	_STARPU_MPI_DEBUG(3, "unique key %x\n", random_number);
 	_STARPU_MPI_DEBUG(3, "unique key %x\n", random_number);
 #endif
 #endif
@@ -1407,7 +1407,7 @@ int starpu_mpi_shutdown(void)
 	starpu_progression_hook_deregister(hookid);
 	starpu_progression_hook_deregister(hookid);
 #endif /* STARPU_MPI_ACTIVITY */
 #endif /* STARPU_MPI_ACTIVITY */
 
 
-	TRACE_MPI_STOP(rank, world_size);
+	_STARPU_MPI_TRACE_STOP(rank, world_size);
 
 
 	/* free the request queues */
 	/* free the request queues */
 	_starpu_mpi_req_list_delete(detached_requests);
 	_starpu_mpi_req_list_delete(detached_requests);

+ 76 - 76
mpi/src/starpu_mpi_fxt.h

@@ -26,86 +26,86 @@
 extern "C" {
 extern "C" {
 #endif
 #endif
 
 
-#define FUT_MPI_START				0x5201
+#define _STARPU_MPI_FUT_START				0x5201
-#define FUT_MPI_STOP				0x5202
+#define _STARPU_MPI_FUT_STOP				0x5202
-#define FUT_MPI_BARRIER				0x5203
+#define _STARPU_MPI_FUT_BARRIER				0x5203
-#define FUT_MPI_ISEND_SUBMIT_BEGIN		0x5204
+#define _STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN		0x5204
-#define FUT_MPI_ISEND_SUBMIT_END		0x5205
+#define _STARPU_MPI_FUT_ISEND_SUBMIT_END		0x5205
-#define FUT_MPI_IRECV_SUBMIT_BEGIN		0x5206
+#define _STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN		0x5206
-#define FUT_MPI_IRECV_SUBMIT_END		0x5207
+#define _STARPU_MPI_FUT_IRECV_SUBMIT_END		0x5207
-#define FUT_MPI_ISEND_COMPLETE_BEGIN		0x5208
+#define _STARPU_MPI_FUT_ISEND_COMPLETE_BEGIN		0x5208
-#define FUT_MPI_ISEND_COMPLETE_END		0x5209
+#define _STARPU_MPI_FUT_ISEND_COMPLETE_END		0x5209
-#define FUT_MPI_IRECV_COMPLETE_BEGIN		0x5210
+#define _STARPU_MPI_FUT_IRECV_COMPLETE_BEGIN		0x5210
-#define FUT_MPI_IRECV_COMPLETE_END		0x5211
+#define _STARPU_MPI_FUT_IRECV_COMPLETE_END		0x5211
-#define FUT_MPI_SLEEP_BEGIN			0x5212
+#define _STARPU_MPI_FUT_SLEEP_BEGIN			0x5212
-#define FUT_MPI_SLEEP_END			0x5213
+#define _STARPU_MPI_FUT_SLEEP_END			0x5213
-#define FUT_MPI_DTESTING_BEGIN			0x5214
+#define _STARPU_MPI_FUT_DTESTING_BEGIN			0x5214
-#define FUT_MPI_DTESTING_END			0x5215
+#define _STARPU_MPI_FUT_DTESTING_END			0x5215
-#define FUT_MPI_UTESTING_BEGIN			0x5216
+#define _STARPU_MPI_FUT_UTESTING_BEGIN			0x5216
-#define FUT_MPI_UTESTING_END			0x5217
+#define _STARPU_MPI_FUT_UTESTING_END			0x5217
-#define FUT_MPI_UWAIT_BEGIN			0x5218
+#define _STARPU_MPI_FUT_UWAIT_BEGIN			0x5218
-#define FUT_MPI_UWAIT_END			0x5219
+#define _STARPU_MPI_FUT_UWAIT_END			0x5219
 
 
 #ifdef STARPU_USE_FXT
 #ifdef STARPU_USE_FXT
-#define TRACE_MPI_START(rank, worldsize)	\
+#define _STARPU_MPI_TRACE_START(rank, worldsize)	\
-	FUT_DO_PROBE3(FUT_MPI_START, (rank), (worldsize), _starpu_gettid());
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_START, (rank), (worldsize), _starpu_gettid());
-#define TRACE_MPI_STOP(rank, worldsize)	\
+#define _STARPU_MPI_TRACE_STOP(rank, worldsize)	\
-	FUT_DO_PROBE3(FUT_MPI_STOP, (rank), (worldsize), _starpu_gettid());
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_STOP, (rank), (worldsize), _starpu_gettid());
-#define TRACE_MPI_BARRIER(rank, worldsize, key)	\
+#define _STARPU_MPI_TRACE_BARRIER(rank, worldsize, key)	\
-	FUT_DO_PROBE4(FUT_MPI_BARRIER, (rank), (worldsize), (key), _starpu_gettid());
+	FUT_DO_PROBE4(_STARPU_MPI_FUT_BARRIER, (rank), (worldsize), (key), _starpu_gettid());
-#define TRACE_MPI_ISEND_SUBMIT_BEGIN(dest, mpi_tag, size)	\
+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(dest, mpi_tag, size)	\
-	FUT_DO_PROBE4(FUT_MPI_ISEND_SUBMIT_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
+	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
-#define TRACE_MPI_ISEND_SUBMIT_END(dest, mpi_tag, size)	\
+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(dest, mpi_tag, size)	\
-	FUT_DO_PROBE4(FUT_MPI_ISEND_SUBMIT_END, (dest), (mpi_tag), (size), _starpu_gettid());
+	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_SUBMIT_END, (dest), (mpi_tag), (size), _starpu_gettid());
-#define TRACE_MPI_IRECV_SUBMIT_BEGIN(src, mpi_tag)	\
+#define _STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_IRECV_SUBMIT_BEGIN, (src), (mpi_tag), _starpu_gettid());
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN, (src), (mpi_tag), _starpu_gettid());
-#define TRACE_MPI_IRECV_SUBMIT_END(src, mpi_tag)	\
+#define _STARPU_MPI_TRACE_IRECV_SUBMIT_END(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_IRECV_SUBMIT_END, (src), (mpi_tag), _starpu_gettid());
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_SUBMIT_END, (src), (mpi_tag), _starpu_gettid());
-#define TRACE_MPI_ISEND_COMPLETE_BEGIN(dest, mpi_tag, size)	\
+#define _STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(dest, mpi_tag, size)	\
-	FUT_DO_PROBE4(FUT_MPI_ISEND_COMPLETE_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
+	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_COMPLETE_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
-#define TRACE_MPI_ISEND_COMPLETE_END(dest, mpi_tag, size)	\
+#define _STARPU_MPI_TRACE_ISEND_COMPLETE_END(dest, mpi_tag, size)	\
-	FUT_DO_PROBE4(FUT_MPI_ISEND_COMPLETE_END, (dest), (mpi_tag), (size), _starpu_gettid());
+	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_COMPLETE_END, (dest), (mpi_tag), (size), _starpu_gettid());
-#define TRACE_MPI_IRECV_COMPLETE_BEGIN(src, mpi_tag)	\
+#define _STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_IRECV_COMPLETE_BEGIN, (src), (mpi_tag), _starpu_gettid());
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_COMPLETE_BEGIN, (src), (mpi_tag), _starpu_gettid());
-#define TRACE_MPI_IRECV_COMPLETE_END(src, mpi_tag)	\
+#define _STARPU_MPI_TRACE_IRECV_COMPLETE_END(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_IRECV_COMPLETE_END, (src), (mpi_tag), _starpu_gettid());
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_COMPLETE_END, (src), (mpi_tag), _starpu_gettid());
-#define TRACE_MPI_SLEEP_BEGIN()	\
+#define _STARPU_MPI_TRACE_SLEEP_BEGIN()	\
-	FUT_DO_PROBE1(FUT_MPI_SLEEP_BEGIN, _starpu_gettid());
+	FUT_DO_PROBE1(_STARPU_MPI_FUT_SLEEP_BEGIN, _starpu_gettid());
-#define TRACE_MPI_SLEEP_END()	\
+#define _STARPU_MPI_TRACE_SLEEP_END()	\
-	FUT_DO_PROBE1(FUT_MPI_SLEEP_END, _starpu_gettid());
+	FUT_DO_PROBE1(_STARPU_MPI_FUT_SLEEP_END, _starpu_gettid());
-#define TRACE_MPI_DTESTING_BEGIN()	\
+#define _STARPU_MPI_TRACE_DTESTING_BEGIN()	\
-	FUT_DO_PROBE1(FUT_MPI_DTESTING_BEGIN,  _starpu_gettid());
+	FUT_DO_PROBE1(_STARPU_MPI_FUT_DTESTING_BEGIN,  _starpu_gettid());
-#define TRACE_MPI_DTESTING_END()	\
+#define _STARPU_MPI_TRACE_DTESTING_END()	\
-	FUT_DO_PROBE1(FUT_MPI_DTESTING_END, _starpu_gettid());
+	FUT_DO_PROBE1(_STARPU_MPI_FUT_DTESTING_END, _starpu_gettid());
-#define TRACE_MPI_UTESTING_BEGIN(src, mpi_tag)	\
+#define _STARPU_MPI_TRACE_UTESTING_BEGIN(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_UTESTING_BEGIN, (src), (mpi_tag),  _starpu_gettid());
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_UTESTING_BEGIN, (src), (mpi_tag),  _starpu_gettid());
-#define TRACE_MPI_UTESTING_END(src, mpi_tag)	\
+#define _STARPU_MPI_TRACE_UTESTING_END(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_UTESTING_END, (src), (mpi_tag), _starpu_gettid());
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_UTESTING_END, (src), (mpi_tag), _starpu_gettid());
-#define TRACE_MPI_UWAIT_BEGIN(src, mpi_tag)	\
+#define _STARPU_MPI_TRACE_UWAIT_BEGIN(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_UWAIT_BEGIN, (src), (mpi_tag),  _starpu_gettid());
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_UWAIT_BEGIN, (src), (mpi_tag),  _starpu_gettid());
-#define TRACE_MPI_UWAIT_END(src, mpi_tag)	\
+#define _STARPU_MPI_TRACE_UWAIT_END(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_UWAIT_END, (src), (mpi_tag), _starpu_gettid());
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_UWAIT_END, (src), (mpi_tag), _starpu_gettid());
 #define TRACE
 #define TRACE
 #else
 #else
-#define TRACE_MPI_START(a, b)				do {} while(0);
+#define _STARPU_MPI_TRACE_START(a, b)				do {} while(0);
-#define TRACE_MPI_STOP(a, b)				do {} while(0);
+#define _STARPU_MPI_TRACE_STOP(a, b)				do {} while(0);
-#define TRACE_MPI_BARRIER(a, b, c)			do {} while(0);
+#define _STARPU_MPI_TRACE_BARRIER(a, b, c)			do {} while(0);
-#define TRACE_MPI_ISEND_SUBMIT_BEGIN(a, b, c)		do {} while(0);
+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(a, b, c)		do {} while(0);
-#define TRACE_MPI_ISEND_SUBMIT_END(a, b, c)		do {} while(0);
+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(a, b, c)		do {} while(0);
-#define TRACE_MPI_IRECV_SUBMIT_BEGIN(a, b)		do {} while(0);
+#define _STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(a, b)		do {} while(0);
-#define TRACE_MPI_IRECV_SUBMIT_END(a, b)		do {} while(0);
+#define _STARPU_MPI_TRACE_IRECV_SUBMIT_END(a, b)		do {} while(0);
-#define TRACE_MPI_ISEND_COMPLETE_BEGIN(a, b, c)		do {} while(0);
+#define _STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(a, b, c)		do {} while(0);
-#define TRACE_MPI_ISEND_COMPLETE_END(a, b, c)		do {} while(0);
+#define _STARPU_MPI_TRACE_ISEND_COMPLETE_END(a, b, c)		do {} while(0);
-#define TRACE_MPI_IRECV_COMPLETE_BEGIN(a, b)		do {} while(0);
+#define _STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(a, b)		do {} while(0);
-#define TRACE_MPI_IRECV_COMPLETE_END(a, b)		do {} while(0);
+#define _STARPU_MPI_TRACE_IRECV_COMPLETE_END(a, b)		do {} while(0);
-#define TRACE_MPI_SLEEP_BEGIN()				do {} while(0);
+#define _STARPU_MPI_TRACE_SLEEP_BEGIN()				do {} while(0);
-#define TRACE_MPI_SLEEP_END()				do {} while(0);
+#define _STARPU_MPI_TRACE_SLEEP_END()				do {} while(0);
-#define TRACE_MPI_DTESTING_BEGIN()			do {} while(0);
+#define _STARPU_MPI_TRACE_DTESTING_BEGIN()			do {} while(0);
-#define TRACE_MPI_DTESTING_END()			do {} while(0);
+#define _STARPU_MPI_TRACE_DTESTING_END()			do {} while(0);
-#define TRACE_MPI_UTESTING_BEGIN(a, b)			do {} while(0);
+#define _STARPU_MPI_TRACE_UTESTING_BEGIN(a, b)			do {} while(0);
-#define TRACE_MPI_UTESTING_END(a, b)			do {} while(0);
+#define _STARPU_MPI_TRACE_UTESTING_END(a, b)			do {} while(0);
-#define TRACE_MPI_UWAIT_BEGIN(a, b)			do {} while(0);
+#define _STARPU_MPI_TRACE_UWAIT_BEGIN(a, b)			do {} while(0);
-#define TRACE_MPI_UWAIT_END(a, b)			do {} while(0);
+#define _STARPU_MPI_TRACE_UWAIT_END(a, b)			do {} while(0);
 #endif
 #endif
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus

+ 9 - 0
src/core/simgrid.c

@@ -253,6 +253,10 @@ void _starpu_simgrid_init()
 	xbt_dynar_free(&hosts);
 	xbt_dynar_free(&hosts);
 }
 }
 
 
+/*
+ * Tasks
+ */
+
 /* Task execution submitted by StarPU */
 /* Task execution submitted by StarPU */
 void _starpu_simgrid_execute_job(struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length)
 void _starpu_simgrid_execute_job(struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length)
 {
 {
@@ -276,8 +280,13 @@ void _starpu_simgrid_execute_job(struct _starpu_job *j, struct starpu_perfmodel_
 			length/1000000.0*MSG_get_host_speed(MSG_host_self()),
 			length/1000000.0*MSG_get_host_speed(MSG_host_self()),
 			0, NULL);
 			0, NULL);
 	MSG_task_execute(simgrid_task);
 	MSG_task_execute(simgrid_task);
+	MSG_task_destroy(simgrid_task);
 }
 }
 
 
+/*
+ * Transfers
+ */
+
 /* Note: simgrid is not parallel, so there is no need to hold locks for management of transfers.  */
 /* Note: simgrid is not parallel, so there is no need to hold locks for management of transfers.  */
 LIST_TYPE(transfer,
 LIST_TYPE(transfer,
 	msg_task_t task;
 	msg_task_t task;

+ 1 - 1
src/datawizard/coherency.c

@@ -261,7 +261,7 @@ static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned
 	if (src_node != dst_node && starpu_node_get_kind(src_node) == STARPU_CUDA_RAM && starpu_node_get_kind(dst_node) == STARPU_CUDA_RAM)
 	if (src_node != dst_node && starpu_node_get_kind(src_node) == STARPU_CUDA_RAM && starpu_node_get_kind(dst_node) == STARPU_CUDA_RAM)
 	{
 	{
 		const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
 		const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
-		if (!copy_methods->cuda_to_cuda_async)
+		if (!copy_methods->cuda_to_cuda_async && !copy_methods->any_to_any)
 			return 0;
 			return 0;
 	}
 	}
 #endif
 #endif

+ 18 - 2
src/datawizard/malloc.c

@@ -50,7 +50,7 @@ struct malloc_pinned_codelet_struct
 //{
 //{
 //	struct malloc_pinned_codelet_struct *s = arg;
 //	struct malloc_pinned_codelet_struct *s = arg;
 //        //        *(s->ptr) = malloc(s->dim);
 //        //        *(s->ptr) = malloc(s->dim);
-//        starpu_opencl_allocate_memory((void **)(s->ptr), s->dim, CL_MEM_READ_WRITE|CL_MEM_ALLOC_HOST_PTR);
+//        starpu_opencl_allocate_memory(devid, (void **)(s->ptr), s->dim, CL_MEM_READ_WRITE|CL_MEM_ALLOC_HOST_PTR);
 //}
 //}
 //#endif
 //#endif
 
 
@@ -404,6 +404,14 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size)
 			STARPU_ASSERT(last[dst_node] >= addr);
 			STARPU_ASSERT(last[dst_node] >= addr);
 			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
 			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
 #else
 #else
+			struct _starpu_worker *worker = _starpu_get_local_worker_key();
+			unsigned devid = _starpu_memory_node_get_devid(dst_node);
+			if (!worker || worker->arch != STARPU_CUDA_WORKER || worker->devid != devid)
+#if defined(HAVE_CUDA_MEMCPY_PEER)
+				starpu_cuda_set_device(devid);
+#else
+				STARPU_ASSERT_MSG(0, "CUDA peer access is not available with this version of CUDA");
+#endif
 			status = cudaMalloc((void **)&addr, size);
 			status = cudaMalloc((void **)&addr, size);
 			if (!addr || (status != cudaSuccess))
 			if (!addr || (status != cudaSuccess))
 			{
 			{
@@ -433,7 +441,7 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size)
                                 int ret;
                                 int ret;
 				cl_mem ptr;
 				cl_mem ptr;
 
 
-				ret = starpu_opencl_allocate_memory(&ptr, size, CL_MEM_READ_WRITE);
+				ret = starpu_opencl_allocate_memory(_starpu_memory_node_get_devid(dst_node), &ptr, size, CL_MEM_READ_WRITE);
 				if (ret)
 				if (ret)
 				{
 				{
 					addr = 0;
 					addr = 0;
@@ -505,6 +513,14 @@ _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
 			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
 #else
 #else
 			cudaError_t err;
 			cudaError_t err;
+			struct _starpu_worker *worker = _starpu_get_local_worker_key();
+			unsigned devid = _starpu_memory_node_get_devid(dst_node);
+			if (!worker || worker->arch != STARPU_CUDA_WORKER || worker->devid != devid)
+#if defined(HAVE_CUDA_MEMCPY_PEER)
+				starpu_cuda_set_device(devid);
+#else
+				STARPU_ASSERT_MSG(0, "CUDA peer access is not available with this version of CUDA");
+#endif
 			err = cudaFree((void*)addr);
 			err = cudaFree((void*)addr);
 			if (STARPU_UNLIKELY(err != cudaSuccess))
 			if (STARPU_UNLIKELY(err != cudaSuccess))
 				STARPU_CUDA_REPORT_ERROR(err);
 				STARPU_CUDA_REPORT_ERROR(err);

+ 10 - 5
src/datawizard/memalloc.c

@@ -440,19 +440,24 @@ static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_re
 
 
 	if (!old_replicate)
 	if (!old_replicate)
 	{
 	{
+		/* Free the copy that we made */
 		free(mc->chunk_interface);
 		free(mc->chunk_interface);
 		mc->chunk_interface = NULL;
 		mc->chunk_interface = NULL;
 	}
 	}
 
 
-	mc->data = new_replicate->handle;
+	/* XXX: We do not actually reuse the mc at the moment, only the interface */
-	/* mc->ops, mc->footprint and mc->interface should be
+
+	/* mc->data = new_replicate->handle; */
+	/* mc->footprint, mc->ops, mc->size_interface, mc->automatically_allocated should be
  	 * unchanged ! */
  	 * unchanged ! */
 
 
-	/* reinsert the mem chunk in the list of active memory chunks */
+	/* remove the mem chunk from the list of active memory chunks, register_mem_chunk will put it back later */
-	if (!is_already_in_mc_list)
+	if (is_already_in_mc_list)
 	{
 	{
-		_starpu_mem_chunk_list_push_back(mc_list[node], mc);
+		_starpu_mem_chunk_list_erase(mc_list[node], mc);
 	}
 	}
+
+	free(mc);
 }
 }
 
 
 static unsigned try_to_reuse_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node, struct _starpu_data_replicate *replicate, unsigned is_already_in_mc_list)
 static unsigned try_to_reuse_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node, struct _starpu_data_replicate *replicate, unsigned is_already_in_mc_list)

+ 19 - 19
src/debug/traces/starpu_fxt.c

@@ -1891,79 +1891,79 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 				handle_user_event(&ev, options);
 				handle_user_event(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_START:
+			case _STARPU_MPI_FUT_START:
 				handle_mpi_start(&ev, options);
 				handle_mpi_start(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_STOP:
+			case _STARPU_MPI_FUT_STOP:
 				handle_mpi_stop(&ev, options);
 				handle_mpi_stop(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_BARRIER:
+			case _STARPU_MPI_FUT_BARRIER:
 				handle_mpi_barrier(&ev, options);
 				handle_mpi_barrier(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_ISEND_SUBMIT_BEGIN:
+			case _STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN:
 				handle_mpi_isend_submit_begin(&ev, options);
 				handle_mpi_isend_submit_begin(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_ISEND_SUBMIT_END:
+			case _STARPU_MPI_FUT_ISEND_SUBMIT_END:
 				handle_mpi_isend_submit_end(&ev, options);
 				handle_mpi_isend_submit_end(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_IRECV_SUBMIT_BEGIN:
+			case _STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN:
 				handle_mpi_irecv_submit_begin(&ev, options);
 				handle_mpi_irecv_submit_begin(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_IRECV_SUBMIT_END:
+			case _STARPU_MPI_FUT_IRECV_SUBMIT_END:
 				handle_mpi_irecv_submit_end(&ev, options);
 				handle_mpi_irecv_submit_end(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_ISEND_COMPLETE_BEGIN:
+			case _STARPU_MPI_FUT_ISEND_COMPLETE_BEGIN:
 				handle_mpi_isend_complete_begin(&ev, options);
 				handle_mpi_isend_complete_begin(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_ISEND_COMPLETE_END:
+			case _STARPU_MPI_FUT_ISEND_COMPLETE_END:
 				handle_mpi_isend_complete_end(&ev, options);
 				handle_mpi_isend_complete_end(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_IRECV_COMPLETE_BEGIN:
+			case _STARPU_MPI_FUT_IRECV_COMPLETE_BEGIN:
 				handle_mpi_irecv_complete_begin(&ev, options);
 				handle_mpi_irecv_complete_begin(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_IRECV_COMPLETE_END:
+			case _STARPU_MPI_FUT_IRECV_COMPLETE_END:
 				handle_mpi_irecv_complete_end(&ev, options);
 				handle_mpi_irecv_complete_end(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_SLEEP_BEGIN:
+			case _STARPU_MPI_FUT_SLEEP_BEGIN:
 				handle_mpi_sleep_begin(&ev, options);
 				handle_mpi_sleep_begin(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_SLEEP_END:
+			case _STARPU_MPI_FUT_SLEEP_END:
 				handle_mpi_sleep_end(&ev, options);
 				handle_mpi_sleep_end(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_DTESTING_BEGIN:
+			case _STARPU_MPI_FUT_DTESTING_BEGIN:
 				handle_mpi_dtesting_begin(&ev, options);
 				handle_mpi_dtesting_begin(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_DTESTING_END:
+			case _STARPU_MPI_FUT_DTESTING_END:
 				handle_mpi_dtesting_end(&ev, options);
 				handle_mpi_dtesting_end(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_UTESTING_BEGIN:
+			case _STARPU_MPI_FUT_UTESTING_BEGIN:
 				handle_mpi_utesting_begin(&ev, options);
 				handle_mpi_utesting_begin(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_UTESTING_END:
+			case _STARPU_MPI_FUT_UTESTING_END:
 				handle_mpi_utesting_end(&ev, options);
 				handle_mpi_utesting_end(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_UWAIT_BEGIN:
+			case _STARPU_MPI_FUT_UWAIT_BEGIN:
 				handle_mpi_uwait_begin(&ev, options);
 				handle_mpi_uwait_begin(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_UWAIT_END:
+			case _STARPU_MPI_FUT_UWAIT_END:
 				handle_mpi_uwait_end(&ev, options);
 				handle_mpi_uwait_end(&ev, options);
 				break;
 				break;
 
 

+ 1 - 1
src/debug/traces/starpu_fxt_mpi.c

@@ -74,7 +74,7 @@ int _starpu_fxt_mpi_find_sync_point(char *filename_in, uint64_t *offset, int *ke
 			break;
 			break;
 		}
 		}
 
 
-		if (ev.code == FUT_MPI_BARRIER)
+		if (ev.code == _STARPU_MPI_FUT_BARRIER)
 		{
 		{
 			/* We found the sync point */
 			/* We found the sync point */
 			*offset = ev.time;
 			*offset = ev.time;

+ 3 - 4
src/drivers/opencl/driver_opencl.c

@@ -214,16 +214,15 @@ cl_int _starpu_opencl_deinit_context(int devid)
 }
 }
 #endif
 #endif
 
 
-cl_int starpu_opencl_allocate_memory(cl_mem *mem STARPU_ATTRIBUTE_UNUSED, size_t size STARPU_ATTRIBUTE_UNUSED, cl_mem_flags flags STARPU_ATTRIBUTE_UNUSED)
+cl_int starpu_opencl_allocate_memory(int devid, cl_mem *mem STARPU_ATTRIBUTE_UNUSED, size_t size STARPU_ATTRIBUTE_UNUSED, cl_mem_flags flags STARPU_ATTRIBUTE_UNUSED)
 {
 {
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 	STARPU_ABORT();
 	STARPU_ABORT();
 #else
 #else
 	cl_int err;
 	cl_int err;
         cl_mem memory;
         cl_mem memory;
-        struct _starpu_worker *worker = _starpu_get_local_worker_key();
 
 
-	memory = clCreateBuffer(contexts[worker->devid], flags, size, NULL, &err);
+	memory = clCreateBuffer(contexts[devid], flags, size, NULL, &err);
 	if (err == CL_OUT_OF_HOST_MEMORY) return err;
 	if (err == CL_OUT_OF_HOST_MEMORY) return err;
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
 
@@ -234,7 +233,7 @@ cl_int starpu_opencl_allocate_memory(cl_mem *mem STARPU_ATTRIBUTE_UNUSED, size_t
 	 */
 	 */
 	char dummy = 0;
 	char dummy = 0;
 	cl_event ev;
 	cl_event ev;
-	err = clEnqueueWriteBuffer(alloc_queues[worker->devid], memory, CL_TRUE,
+	err = clEnqueueWriteBuffer(alloc_queues[devid], memory, CL_TRUE,
 				   0, sizeof(dummy), &dummy,
 				   0, sizeof(dummy), &dummy,
 				   0, NULL, &ev);
 				   0, NULL, &ev);
 	if (err == CL_MEM_OBJECT_ALLOCATION_FAILURE)
 	if (err == CL_MEM_OBJECT_ALLOCATION_FAILURE)

+ 1 - 2
src/sched_policies/locality_work_stealing_policy.c

@@ -224,13 +224,12 @@ static int lws_push_task(struct starpu_task *task)
 #ifndef STARPU_NON_BLOCKING_DRIVERS
 #ifndef STARPU_NON_BLOCKING_DRIVERS
 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
 	struct starpu_sched_ctx_iterator it;
 	struct starpu_sched_ctx_iterator it;
+	unsigned worker;
 	if(workers->init_iterator)
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 		workers->init_iterator(workers, &it);
 	while(workers->has_next(workers, &it))
 	while(workers->has_next(workers, &it))
 	{
 	{
 		worker = workers->get_next(workers, &it);
 		worker = workers->get_next(workers, &it);
-		starpu_pthread_mutex_t *sched_mutex;
-		starpu_pthread_cond_t *sched_cond;
 		starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
 		starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
 		STARPU_PTHREAD_COND_SIGNAL(sched_cond);
 		STARPU_PTHREAD_COND_SIGNAL(sched_cond);
 	}
 	}

+ 1 - 0
tests/Makefile.am

@@ -151,6 +151,7 @@ noinst_PROGRAMS =				\
 	datawizard/acquire_cb_insert		\
 	datawizard/acquire_cb_insert		\
 	datawizard/acquire_release		\
 	datawizard/acquire_release		\
 	datawizard/acquire_release2		\
 	datawizard/acquire_release2		\
+	datawizard/cache			\
 	datawizard/commute			\
 	datawizard/commute			\
 	datawizard/copy				\
 	datawizard/copy				\
 	datawizard/data_implicit_deps		\
 	datawizard/data_implicit_deps		\

+ 100 - 0
tests/datawizard/cache.c

@@ -0,0 +1,100 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2014  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "../helper.h"
+
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
+static void codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+     FPRINTF(stderr, "%lx\n", (unsigned long) STARPU_VARIABLE_GET_PTR(descr[0]));
+     FPRINTF(stderr, "codelet\n");
+}
+#endif
+
+#ifdef STARPU_USE_CUDA
+static struct starpu_codelet cuda_cl =
+{
+     .cuda_funcs = {codelet, NULL},
+     .nbuffers = 1,
+     .modes = {STARPU_R}
+};
+#endif
+
+#ifdef STARPU_USE_OPENCL
+static struct starpu_codelet opencl_cl =
+{
+     .opencl_funcs = {codelet, NULL},
+     .nbuffers = 1,
+     .modes = {STARPU_R}
+};
+#endif
+
+void dotest(struct starpu_codelet *cl)
+{
+     int ret;
+     int var = 42;
+     starpu_data_handle_t handle;
+
+     starpu_variable_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)&var, sizeof(var));
+
+     ret = starpu_task_insert(cl, STARPU_R, handle, 0);
+     if (ret == -ENODEV) goto enodev;
+     STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+
+     starpu_task_wait_for_all();
+
+     starpu_data_unregister(handle);
+
+     starpu_variable_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)&var, sizeof(var));
+
+     ret = starpu_task_insert(cl, STARPU_R, handle, 0);
+     if (ret == -ENODEV) goto enodev;
+     STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+
+     starpu_task_wait_for_all();
+
+enodev:
+     starpu_data_unregister(handle);
+}
+
+int main(int argc, char **argv)
+{
+     int ret;
+
+     ret = starpu_init(NULL);
+     if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+     STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_USE_CUDA
+     dotest(&cuda_cl);
+#endif
+#ifdef STARPU_USE_OPENCL
+     dotest(&opencl_cl);
+#endif
+
+     starpu_shutdown();
+
+     return 0;
+
+enodev:
+     starpu_shutdown();
+     /* yes, we do not perform the computation but we did detect that no one
+      * could perform the kernel, so this is not an error from StarPU */
+     fprintf(stderr, "WARNING: No one can execute this task\n");
+     return STARPU_TEST_SKIPPED;
+}

+ 9 - 10
tests/datawizard/gpu_ptr_register.c

@@ -55,11 +55,13 @@ submit_tasks(starpu_data_handle_t handle, int pieces, int n)
 static int
 static int
 find_a_worker(enum starpu_worker_archtype type)
 find_a_worker(enum starpu_worker_archtype type)
 {
 {
-	int worker;
+	int worker[STARPU_NMAXWORKERS];
-	int ret = starpu_worker_get_ids_by_type(type, &worker, 1);
+	int ret = starpu_worker_get_ids_by_type(type, worker, STARPU_NMAXWORKERS);
 	if (ret == 0)
 	if (ret == 0)
 		return -ENODEV;
 		return -ENODEV;
-	return worker;
+	if (ret == -ERANGE)
+		return worker[STARPU_NMAXWORKERS-1];
+	return worker[ret-1];
 }
 }
 
 
 static int
 static int
@@ -78,7 +80,7 @@ check_result(unsigned *t, size_t size)
 }
 }
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-#if CUDART_VERSION >= 4000
+#ifdef HAVE_CUDA_MEMCPY_PEER
 static int
 static int
 test_cuda(void)
 test_cuda(void)
 {
 {
@@ -100,8 +102,7 @@ test_cuda(void)
 	size = 10 * n;
 	size = 10 * n;
 
 
 	devid = starpu_worker_get_devid(chosen);
 	devid = starpu_worker_get_devid(chosen);
-	starpu_cuda_set_device(devid);
+	foo_gpu = (void*) starpu_malloc_on_node(starpu_worker_get_memory_node(chosen), size * sizeof(*foo_gpu));
-	cudaMalloc((void**)&foo_gpu, size * sizeof(*foo_gpu));
 
 
 	foo = calloc(size, sizeof(*foo));
 	foo = calloc(size, sizeof(*foo));
 	for (i = 0; i < size; i++)
 	for (i = 0; i < size; i++)
@@ -180,9 +181,7 @@ test_opencl(void)
 	starpu_opencl_get_context(devid, &context);
 	starpu_opencl_get_context(devid, &context);
 	starpu_opencl_get_queue(devid, &queue);
 	starpu_opencl_get_queue(devid, &queue);
 
 
-	foo_gpu = clCreateBuffer(context, CL_MEM_READ_WRITE, size*sizeof(int), NULL, &err);
+	foo_gpu = (void*) starpu_malloc_on_node(starpu_worker_get_memory_node(chosen), size * sizeof(int));
-	if (STARPU_UNLIKELY(err != CL_SUCCESS))
-		STARPU_OPENCL_REPORT_ERROR(err);
 
 
 	unsigned int *foo = malloc(size*sizeof(*foo));
 	unsigned int *foo = malloc(size*sizeof(*foo));
 	for (i = 0; i < size; i++)
 	for (i = 0; i < size; i++)
@@ -261,7 +260,7 @@ int main(int argc, char **argv)
 #endif
 #endif
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-#if CUDART_VERSION >= 4000 /* We need thread-safety of CUDA */
+#ifdef HAVE_CUDA_MEMCPY_PEER
 	ret = test_cuda();
 	ret = test_cuda();
 	if (ret == 1)
 	if (ret == 1)
 		goto fail;
 		goto fail;

+ 10 - 11
tests/datawizard/gpu_register.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2011-2012  Université de Bordeaux 1
+ * Copyright (C) 2011-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2012 inria
  * Copyright (C) 2012 inria
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -55,11 +55,13 @@ submit_tasks(starpu_data_handle_t handle, int pieces, int n)
 static int
 static int
 find_a_worker(enum starpu_worker_archtype type)
 find_a_worker(enum starpu_worker_archtype type)
 {
 {
-	int worker;
+	int worker[STARPU_NMAXWORKERS];
-	int ret = starpu_worker_get_ids_by_type(type, &worker, 1);
+	int ret = starpu_worker_get_ids_by_type(type, worker, STARPU_NMAXWORKERS);
 	if (ret == 0)
 	if (ret == 0)
 		return -ENODEV;
 		return -ENODEV;
-	return worker;
+	if (ret == -ERANGE)
+		return worker[STARPU_NMAXWORKERS-1];
+	return worker[ret-1];
 }
 }
 
 
 static int
 static int
@@ -78,7 +80,7 @@ check_result(unsigned *t, size_t size)
 }
 }
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-#if CUDART_VERSION >= 4000
+#ifdef HAVE_CUDA_MEMCPY_PEER
 static int
 static int
 test_cuda(void)
 test_cuda(void)
 {
 {
@@ -100,8 +102,7 @@ test_cuda(void)
 	size = 10 * n;
 	size = 10 * n;
 
 
 	devid = starpu_worker_get_devid(chosen);
 	devid = starpu_worker_get_devid(chosen);
-	starpu_cuda_set_device(devid);
+	foo_gpu = (void*) starpu_malloc_on_node(starpu_worker_get_memory_node(chosen), size * sizeof(*foo_gpu));
-	cudaMalloc((void**)&foo_gpu, size * sizeof(*foo_gpu));
 
 
 	foo = calloc(size, sizeof(*foo));
 	foo = calloc(size, sizeof(*foo));
 	for (i = 0; i < size; i++)
 	for (i = 0; i < size; i++)
@@ -182,9 +183,7 @@ test_opencl(void)
 	starpu_opencl_get_context(devid, &context);
 	starpu_opencl_get_context(devid, &context);
 	starpu_opencl_get_queue(devid, &queue);
 	starpu_opencl_get_queue(devid, &queue);
 
 
-	foo_gpu = clCreateBuffer(context, CL_MEM_READ_WRITE, size*sizeof(int), NULL, &err);
+	foo_gpu = (void*) starpu_malloc_on_node(starpu_worker_get_memory_node(chosen), size * sizeof(int));
-	if (STARPU_UNLIKELY(err != CL_SUCCESS))
-		STARPU_OPENCL_REPORT_ERROR(err);
 
 
 	unsigned int *foo = malloc(size*sizeof(*foo));
 	unsigned int *foo = malloc(size*sizeof(*foo));
 	for (i = 0; i < size; i++)
 	for (i = 0; i < size; i++)
@@ -269,7 +268,7 @@ int main(int argc, char **argv)
 #endif
 #endif
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-#if CUDART_VERSION >= 4000 /* We need thread-safety of CUDA */
+#ifdef HAVE_CUDA_MEMCPY_PEER
 	ret = test_cuda();
 	ret = test_cuda();
 	if (ret == 1)
 	if (ret == 1)
 		goto fail;
 		goto fail;

+ 10 - 0
tests/sched_policies/simple_cpu_gpu_sched.c

@@ -200,6 +200,16 @@ run(struct starpu_sched_policy *policy)
 	if (cpu_task_worker != STARPU_CPU_WORKER || (gpu_task_worker != STARPU_CUDA_WORKER && gpu_task_worker != STARPU_OPENCL_WORKER))
 	if (cpu_task_worker != STARPU_CPU_WORKER || (gpu_task_worker != STARPU_CUDA_WORKER && gpu_task_worker != STARPU_OPENCL_WORKER))
 	{
 	{
 		FPRINTF(stderr, "Tasks did not execute on expected worker\n");
 		FPRINTF(stderr, "Tasks did not execute on expected worker\n");
+	{
+		if (cpu_task_worker != STARPU_CPU_WORKER)
+		{
+			FPRINTF(stderr, "The CPU task did not run on a CPU worker\n");
+		}
+		if (gpu_task_worker != STARPU_CUDA_WORKER && gpu_task_worker != STARPU_OPENCL_WORKER)
+		{
+			FPRINTF(stderr, "The GPU task did not run on a Cuda or OpenCL worker\n");
+		}
+
 		ret = 1;
 		ret = 1;
 	}
 	}
 	else
 	else