浏览代码

merge trunk

Samuel Thibault 10 年之前
父节点
当前提交
8a1120b2d1
共有 41 个文件被更改,包括 540 次插入252 次删除
  1. 2 1
      Makefile.am
  2. 2 0
      configure.ac
  3. 7 5
      doc/doxygen/chapters/13offline_performance_tools.doxy
  4. 2 2
      doc/doxygen/chapters/21simgrid.doxy
  5. 6 4
      doc/doxygen/chapters/api/data_interfaces.doxy
  6. 2 1
      examples/basic_examples/vector_scal_c.c
  7. 1 1
      examples/filters/custom_mf/custom_interface.c
  8. 6 0
      examples/lu/lu_example.c
  9. 1 6
      include/starpu.h
  10. 1 1
      include/starpu_data_interfaces.h
  11. 2 2
      include/starpu_opencl.h
  12. 28 0
      include/starpu_simgrid_wrap.h
  13. 17 17
      mpi/src/starpu_mpi.c
  14. 76 76
      mpi/src/starpu_mpi_fxt.h
  15. 6 5
      src/common/starpu_spinlock.c
  16. 1 2
      src/common/starpu_spinlock.h
  17. 21 19
      src/common/utils.c
  18. 2 2
      src/core/perfmodel/perfmodel_history.c
  19. 2 0
      src/core/sched_policy.c
  20. 23 2
      src/core/simgrid.c
  21. 11 3
      src/core/workers.c
  22. 3 0
      src/core/workers.h
  23. 42 23
      src/datawizard/coherency.c
  24. 11 6
      src/datawizard/data_request.c
  25. 18 2
      src/datawizard/malloc.c
  26. 10 5
      src/datawizard/memalloc.c
  27. 19 19
      src/debug/traces/starpu_fxt.c
  28. 1 1
      src/debug/traces/starpu_fxt_mpi.c
  29. 3 0
      src/drivers/cpu/driver_cpu.c
  30. 8 0
      src/drivers/cuda/driver_cuda.c
  31. 1 0
      src/drivers/gordon/driver_gordon.c
  32. 2 0
      src/drivers/mp_common/source_common.c
  33. 11 4
      src/drivers/opencl/driver_opencl.c
  34. 29 9
      src/sched_policies/eager_central_policy.c
  35. 29 9
      src/sched_policies/eager_central_priority_policy.c
  36. 3 4
      src/sched_policies/locality_work_stealing_policy.c
  37. 1 0
      tests/Makefile.am
  38. 100 0
      tests/datawizard/cache.c
  39. 9 10
      tests/datawizard/gpu_ptr_register.c
  40. 10 11
      tests/datawizard/gpu_register.c
  41. 11 0
      tests/sched_policies/simple_cpu_gpu_sched.c

+ 2 - 1
Makefile.am

@@ -85,7 +85,8 @@ versinclude_HEADERS = 				\
 	include/starpu_stdlib.h			\
 	include/starpu_thread.h			\
 	include/starpu_thread_util.h		\
-	include/starpu_tree.h
+	include/starpu_tree.h			\
+	include/starpu_simgrid_wrap.h
 
 nodist_versinclude_HEADERS = 			\
 	include/starpu_config.h

+ 2 - 0
configure.ac

@@ -1820,6 +1820,8 @@ AC_SUBST(USE_MPI, $use_mpi)
 AM_CONDITIONAL(USE_MPI, test x$use_mpi = xyes)
 if test x$use_mpi = xyes; then
 	AC_DEFINE(STARPU_USE_MPI,[],[whether the StarPU MPI library is available])
+else
+	running_mpi_check=no
 fi
 
 AC_ARG_ENABLE(mpi-progression-hook, [AS_HELP_STRING([--enable-mpi-progression-hook],

+ 7 - 5
doc/doxygen/chapters/13offline_performance_tools.doxy

@@ -119,8 +119,9 @@ $ vite paje.trace
 To get names of tasks instead of "unknown", fill the optional
 starpu_codelet::name, or use a performance model for them.
 Details of the codelet execution can be obtained by passing
-<c>--enable-paje-codelet-details</c> and using a recent enough version of ViTE
-(at least r1430).
+\ref enable-paje-codelet-details "--enable-paje-codelet-details" when
+configuring StarPU and using a recent enough version of ViTE (at least
+r1430).
 
 In the MPI execution case, collect the trace files from the MPI nodes, and
 specify them all on the command <c>starpu_fxt_tool</c>, for instance:
@@ -133,9 +134,10 @@ By default, all tasks are displayed using a green color. To display tasks with
 varying colors, pass option <c>-c</c> to <c>starpu_fxt_tool</c>.
 
 To identify tasks precisely, the application can set the ::tag_id field of the
-tasks (or use STARPU_TAG_ONY when using starpu_task_insert), and with a recent
-enough version of vite (>= r1430) and the <c>--enable-paje-codelet-details</c>
-configure option, the value of the tag will show up in the trace.
+tasks (or use STARPU_TAG_ONLY when using starpu_task_insert()), and with a recent
+enough version of vite (>= r1430) and the
+\ref enable-paje-codelet-details "--enable-paje-codelet-details"
+StarPU configure option, the value of the tag will show up in the trace.
 
 Traces can also be inspected by hand by using the tool <c>fxt_print</c>, for instance:
 

+ 2 - 2
doc/doxygen/chapters/21simgrid.doxy

@@ -22,9 +22,9 @@ get the simulated time, it has to use starpu_timing_now() which returns the
 virtual timestamp in us.
 
 For some technical reason, the application's .c file which contains main() has
-to be recompiled with starpu.h, which in the simgrid case will # define main()
+to be recompiled with starpu_simgrid_wrap.h, which in the simgrid case will # define main()
 into starpu_main(), and it is libstarpu which will provide the real main() and
-call the application's main().
+will call the application's main().
 
 To be able to test with crazy data sizes, one may want to only allocate
 application data if STARPU_SIMGRID is not defined.  Passing a NULL pointer to

+ 6 - 4
doc/doxygen/chapters/api/data_interfaces.doxy

@@ -57,8 +57,9 @@ case of e.g. available particular CUDA or OpenCL support.
 \ingroup API_Data_Interfaces
 \var starpu_data_copy_methods::can_copy
 If defined, allows the interface to declare whether it supports transferring
-from \p src_interface on node \p src_node to \p dst_interface on node \p. If not
-defined, it is assumed that the interface supports all transfers.
+from \p src_interface on node \p src_node to \p dst_interface on node \p
+dst_node, run from node \p handling_node. If not defined, it is assumed that the
+interface supports all transfers.
 \var starpu_data_copy_methods::ram_to_ram
 Define how to copy data from the \p src_interface interface on the \p
 src_node CPU node to the \p dst_interface interface on the \p dst_node
@@ -1000,11 +1001,12 @@ DefiningANewDataInterface.
 \ingroup API_Data_Interfaces
 Allocate \p size bytes on node \p dst_node. This returns 0 if
 allocation failed, the allocation method should then return <c>-ENOMEM</c> as
-allocated size.
+allocated size. Deallocation must be done with starpu_free_on_node.
 
 \fn void starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 \ingroup API_Data_Interfaces
-Free \p addr of \p size bytes on node \p dst_node.
+Free \p addr of \p size bytes on node \p dst_node which was previously allocated
+with starpu_malloc_on_node.
 
 \fn int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, void *async_data)
 \ingroup API_Data_Interfaces

+ 2 - 1
examples/basic_examples/vector_scal_c.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- * Copyright (C) 2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2011, 2013-2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -47,6 +47,7 @@ static struct starpu_codelet cl =
 #ifdef STARPU_USE_CUDA
 	/* CUDA implementation of the codelet */
 	.cuda_funcs = {scal_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 1,
 	.model = &vector_scal_model

+ 1 - 1
examples/filters/custom_mf/custom_interface.c

@@ -425,7 +425,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
 	size = src_custom->nx * 2 * sizeof(float);
 	if (dst_custom->cpu_ptr == NULL)
 	{
-		ret = starpu_opencl_allocate_memory((cl_mem*)&dst_custom->cpu_ptr,
+		ret = starpu_opencl_allocate_memory(devid, (cl_mem*)&dst_custom->cpu_ptr,
 				size, CL_MEM_READ_WRITE);
 		assert(ret == CL_SUCCESS);
 	}

+ 6 - 0
examples/lu/lu_example.c

@@ -58,6 +58,7 @@ static void parse_args(int argc, char **argv)
 			nblocks = strtol(argv[++i], &argptr, 10);
 		}
 
+#ifndef STARPU_SIMGRID
 		else if (strcmp(argv[i], "-check") == 0)
 		{
 			check = 1;
@@ -72,6 +73,7 @@ static void parse_args(int argc, char **argv)
 		{
 			no_stride = 1;
 		}
+#endif
 
 		else if (strcmp(argv[i], "-profile") == 0)
 		{
@@ -315,6 +317,7 @@ int main(int argc, char **argv)
 
 	starpu_cublas_init();
 
+#ifndef STARPU_SIMGRID
 	init_matrix();
 
 	unsigned *ipiv = NULL;
@@ -364,6 +367,7 @@ int main(int argc, char **argv)
 		}
 	}
 	else
+#endif
 	{
 		ret = STARPU_LU(lu_decomposition)(A, size, size, nblocks);
 	}
@@ -403,6 +407,7 @@ int main(int argc, char **argv)
 		}
 	}
 
+#ifndef STARPU_SIMGRID
 	if (check)
 	{
 		FPRINTF(stderr, "Checking result\n");
@@ -413,6 +418,7 @@ int main(int argc, char **argv)
 
 		check_result();
 	}
+#endif
 
 	starpu_free(A);
 

+ 1 - 6
include/starpu.h

@@ -66,18 +66,13 @@ typedef UINT_PTR uintptr_t;
 #include <starpu_fxt.h>
 #include <starpu_driver.h>
 #include <starpu_tree.h>
+#include <starpu_simgrid_wrap.h>
 
 #ifdef __cplusplus
 extern "C"
 {
 #endif
 
-#ifdef STARPU_SIMGRID
-#ifndef main
-#define main starpu_main
-#endif
-#endif
-
 struct starpu_conf
 {
 	int magic;

+ 1 - 1
include/starpu_data_interfaces.h

@@ -37,7 +37,7 @@ extern "C"
 
 struct starpu_data_copy_methods
 {
-	int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+	int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node);
 
 	int (*ram_to_ram)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*ram_to_cuda)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);

+ 2 - 2
include/starpu_opencl.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -76,7 +76,7 @@ int starpu_opencl_collect_stats(cl_event event);
 
 int starpu_opencl_set_kernel_args(cl_int *err, cl_kernel *kernel, ...);
 
-cl_int starpu_opencl_allocate_memory(cl_mem *addr, size_t size, cl_mem_flags flags);
+cl_int starpu_opencl_allocate_memory(int devid, cl_mem *addr, size_t size, cl_mem_flags flags);
 
 cl_int starpu_opencl_copy_ram_to_opencl(void *ptr, unsigned src_node, cl_mem buffer, unsigned dst_node, size_t size, size_t offset, cl_event *event, int *ret);
 

+ 28 - 0
include/starpu_simgrid_wrap.h

@@ -0,0 +1,28 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_SIMGRID_WRAP_H__
+#define __STARPU_SIMGRID_WRAP_H__
+
+#include <starpu_config.h>
+
+#ifdef STARPU_SIMGRID
+#ifndef main
+#define main starpu_main
+#endif
+#endif
+
+#endif /* __STARPU_SIMGRID_WRAP_H__ */

+ 17 - 17
mpi/src/starpu_mpi.c

@@ -173,12 +173,12 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
 
 	 _starpu_mpi_comm_amounts_inc(req->comm, req->srcdst, req->datatype, req->count);
 
-	 TRACE_MPI_ISEND_SUBMIT_BEGIN(req->srcdst, req->mpi_tag, 0);
+	 _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(req->srcdst, req->mpi_tag, 0);
 
 	 req->ret = MPI_Isend(req->ptr, req->count, req->datatype, req->srcdst, _starpu_mpi_tag, req->comm, &req->request);
 	 STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Isend returning %d", req->ret);
 
-	 TRACE_MPI_ISEND_SUBMIT_END(req->srcdst, req->mpi_tag, 0);
+	 _STARPU_MPI_TRACE_ISEND_SUBMIT_END(req->srcdst, req->mpi_tag, 0);
 
 	 /* somebody is perhaps waiting for the MPI request to be posted */
 	 STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
@@ -304,12 +304,12 @@ static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 
 	_STARPU_MPI_DEBUG(20, "post MPI irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
 
-	TRACE_MPI_IRECV_SUBMIT_BEGIN(req->srcdst, req->mpi_tag);
+	_STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(req->srcdst, req->mpi_tag);
 
 	req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->srcdst, _starpu_mpi_tag, req->comm, &req->request);
 	STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_IRecv returning %d", req->ret);
 
-	TRACE_MPI_IRECV_SUBMIT_END(req->srcdst, req->mpi_tag);
+	_STARPU_MPI_TRACE_IRECV_SUBMIT_END(req->srcdst, req->mpi_tag);
 
 	/* somebody is perhaps waiting for the MPI request to be posted */
 	STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
@@ -413,12 +413,12 @@ static void _starpu_mpi_wait_func(struct _starpu_mpi_req *waiting_req)
 	/* Which is the mpi request we are waiting for ? */
 	struct _starpu_mpi_req *req = waiting_req->other_request;
 
-	TRACE_MPI_UWAIT_BEGIN(req->srcdst, req->mpi_tag);
+	_STARPU_MPI_TRACE_UWAIT_BEGIN(req->srcdst, req->mpi_tag);
 
 	req->ret = MPI_Wait(&req->request, waiting_req->status);
 	STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Wait returning %d", req->ret);
 
-	TRACE_MPI_UWAIT_END(req->srcdst, req->mpi_tag);
+	_STARPU_MPI_TRACE_UWAIT_END(req->srcdst, req->mpi_tag);
 
 	_starpu_mpi_handle_request_termination(req);
 	_STARPU_MPI_LOG_OUT();
@@ -481,12 +481,12 @@ static void _starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
 	_STARPU_MPI_DEBUG(2, "Test request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n",
 			  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
 
-	TRACE_MPI_UTESTING_BEGIN(req->srcdst, req->mpi_tag);
+	_STARPU_MPI_TRACE_UTESTING_BEGIN(req->srcdst, req->mpi_tag);
 
 	req->ret = MPI_Test(&req->request, testing_req->flag, testing_req->status);
 	STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Test returning %d", req->ret);
 
-	TRACE_MPI_UTESTING_END(req->srcdst, req->mpi_tag);
+	_STARPU_MPI_TRACE_UTESTING_END(req->srcdst, req->mpi_tag);
 
 	if (*testing_req->flag)
 	{
@@ -933,22 +933,22 @@ static void _starpu_mpi_test_detached_requests(void)
 		{
 			if (req->request_type == RECV_REQ)
 			{
-				TRACE_MPI_IRECV_COMPLETE_BEGIN(req->srcdst, req->mpi_tag);
+				_STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(req->srcdst, req->mpi_tag);
 			}
 			else if (req->request_type == SEND_REQ)
 			{
-				TRACE_MPI_ISEND_COMPLETE_BEGIN(req->srcdst, req->mpi_tag, 0);
+				_STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(req->srcdst, req->mpi_tag, 0);
 			}
 
 			_starpu_mpi_handle_request_termination(req);
 
 			if (req->request_type == RECV_REQ)
 			{
-				TRACE_MPI_IRECV_COMPLETE_END(req->srcdst, req->mpi_tag);
+				_STARPU_MPI_TRACE_IRECV_COMPLETE_END(req->srcdst, req->mpi_tag);
 			}
 			else if (req->request_type == SEND_REQ)
 			{
-				TRACE_MPI_ISEND_COMPLETE_END(req->srcdst, req->mpi_tag, 0);
+				_STARPU_MPI_TRACE_ISEND_COMPLETE_END(req->srcdst, req->mpi_tag, 0);
 			}
 		}
 
@@ -1057,7 +1057,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
 
 	{
-		TRACE_MPI_START(rank, worldsize);
+		_STARPU_MPI_TRACE_START(rank, worldsize);
 #ifdef STARPU_USE_FXT
 		starpu_profiling_set_id(rank);
 #endif //STARPU_USE_FXT
@@ -1097,14 +1097,14 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		{
 			_STARPU_MPI_DEBUG(3, "NO MORE REQUESTS TO HANDLE\n");
 
-			TRACE_MPI_SLEEP_BEGIN();
+			_STARPU_MPI_TRACE_SLEEP_BEGIN();
 
 			if (barrier_running)
 				/* Tell mpi_barrier */
 				STARPU_PTHREAD_COND_SIGNAL(&cond_finished);
 			STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
 
-			TRACE_MPI_SLEEP_END();
+			_STARPU_MPI_TRACE_SLEEP_END();
 		}
 
 		/* get one request */
@@ -1323,7 +1323,7 @@ static void _starpu_mpi_add_sync_point_in_fxt(void)
 	ret = MPI_Bcast(&random_number, 1, MPI_INT, 0, MPI_COMM_WORLD);
 	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Bcast returning %d", ret);
 
-	TRACE_MPI_BARRIER(rank, worldsize, random_number);
+	_STARPU_MPI_TRACE_BARRIER(rank, worldsize, random_number);
 
 	_STARPU_MPI_DEBUG(3, "unique key %x\n", random_number);
 #endif
@@ -1407,7 +1407,7 @@ int starpu_mpi_shutdown(void)
 	starpu_progression_hook_deregister(hookid);
 #endif /* STARPU_MPI_ACTIVITY */
 
-	TRACE_MPI_STOP(rank, world_size);
+	_STARPU_MPI_TRACE_STOP(rank, world_size);
 
 	/* free the request queues */
 	_starpu_mpi_req_list_delete(detached_requests);

+ 76 - 76
mpi/src/starpu_mpi_fxt.h

@@ -26,86 +26,86 @@
 extern "C" {
 #endif
 
-#define FUT_MPI_START				0x5201
-#define FUT_MPI_STOP				0x5202
-#define FUT_MPI_BARRIER				0x5203
-#define FUT_MPI_ISEND_SUBMIT_BEGIN		0x5204
-#define FUT_MPI_ISEND_SUBMIT_END		0x5205
-#define FUT_MPI_IRECV_SUBMIT_BEGIN		0x5206
-#define FUT_MPI_IRECV_SUBMIT_END		0x5207
-#define FUT_MPI_ISEND_COMPLETE_BEGIN		0x5208
-#define FUT_MPI_ISEND_COMPLETE_END		0x5209
-#define FUT_MPI_IRECV_COMPLETE_BEGIN		0x5210
-#define FUT_MPI_IRECV_COMPLETE_END		0x5211
-#define FUT_MPI_SLEEP_BEGIN			0x5212
-#define FUT_MPI_SLEEP_END			0x5213
-#define FUT_MPI_DTESTING_BEGIN			0x5214
-#define FUT_MPI_DTESTING_END			0x5215
-#define FUT_MPI_UTESTING_BEGIN			0x5216
-#define FUT_MPI_UTESTING_END			0x5217
-#define FUT_MPI_UWAIT_BEGIN			0x5218
-#define FUT_MPI_UWAIT_END			0x5219
+#define _STARPU_MPI_FUT_START				0x5201
+#define _STARPU_MPI_FUT_STOP				0x5202
+#define _STARPU_MPI_FUT_BARRIER				0x5203
+#define _STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN		0x5204
+#define _STARPU_MPI_FUT_ISEND_SUBMIT_END		0x5205
+#define _STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN		0x5206
+#define _STARPU_MPI_FUT_IRECV_SUBMIT_END		0x5207
+#define _STARPU_MPI_FUT_ISEND_COMPLETE_BEGIN		0x5208
+#define _STARPU_MPI_FUT_ISEND_COMPLETE_END		0x5209
+#define _STARPU_MPI_FUT_IRECV_COMPLETE_BEGIN		0x5210
+#define _STARPU_MPI_FUT_IRECV_COMPLETE_END		0x5211
+#define _STARPU_MPI_FUT_SLEEP_BEGIN			0x5212
+#define _STARPU_MPI_FUT_SLEEP_END			0x5213
+#define _STARPU_MPI_FUT_DTESTING_BEGIN			0x5214
+#define _STARPU_MPI_FUT_DTESTING_END			0x5215
+#define _STARPU_MPI_FUT_UTESTING_BEGIN			0x5216
+#define _STARPU_MPI_FUT_UTESTING_END			0x5217
+#define _STARPU_MPI_FUT_UWAIT_BEGIN			0x5218
+#define _STARPU_MPI_FUT_UWAIT_END			0x5219
 
 #ifdef STARPU_USE_FXT
-#define TRACE_MPI_START(rank, worldsize)	\
-	FUT_DO_PROBE3(FUT_MPI_START, (rank), (worldsize), _starpu_gettid());
-#define TRACE_MPI_STOP(rank, worldsize)	\
-	FUT_DO_PROBE3(FUT_MPI_STOP, (rank), (worldsize), _starpu_gettid());
-#define TRACE_MPI_BARRIER(rank, worldsize, key)	\
-	FUT_DO_PROBE4(FUT_MPI_BARRIER, (rank), (worldsize), (key), _starpu_gettid());
-#define TRACE_MPI_ISEND_SUBMIT_BEGIN(dest, mpi_tag, size)	\
-	FUT_DO_PROBE4(FUT_MPI_ISEND_SUBMIT_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
-#define TRACE_MPI_ISEND_SUBMIT_END(dest, mpi_tag, size)	\
-	FUT_DO_PROBE4(FUT_MPI_ISEND_SUBMIT_END, (dest), (mpi_tag), (size), _starpu_gettid());
-#define TRACE_MPI_IRECV_SUBMIT_BEGIN(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_IRECV_SUBMIT_BEGIN, (src), (mpi_tag), _starpu_gettid());
-#define TRACE_MPI_IRECV_SUBMIT_END(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_IRECV_SUBMIT_END, (src), (mpi_tag), _starpu_gettid());
-#define TRACE_MPI_ISEND_COMPLETE_BEGIN(dest, mpi_tag, size)	\
-	FUT_DO_PROBE4(FUT_MPI_ISEND_COMPLETE_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
-#define TRACE_MPI_ISEND_COMPLETE_END(dest, mpi_tag, size)	\
-	FUT_DO_PROBE4(FUT_MPI_ISEND_COMPLETE_END, (dest), (mpi_tag), (size), _starpu_gettid());
-#define TRACE_MPI_IRECV_COMPLETE_BEGIN(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_IRECV_COMPLETE_BEGIN, (src), (mpi_tag), _starpu_gettid());
-#define TRACE_MPI_IRECV_COMPLETE_END(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_IRECV_COMPLETE_END, (src), (mpi_tag), _starpu_gettid());
-#define TRACE_MPI_SLEEP_BEGIN()	\
-	FUT_DO_PROBE1(FUT_MPI_SLEEP_BEGIN, _starpu_gettid());
-#define TRACE_MPI_SLEEP_END()	\
-	FUT_DO_PROBE1(FUT_MPI_SLEEP_END, _starpu_gettid());
-#define TRACE_MPI_DTESTING_BEGIN()	\
-	FUT_DO_PROBE1(FUT_MPI_DTESTING_BEGIN,  _starpu_gettid());
-#define TRACE_MPI_DTESTING_END()	\
-	FUT_DO_PROBE1(FUT_MPI_DTESTING_END, _starpu_gettid());
-#define TRACE_MPI_UTESTING_BEGIN(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_UTESTING_BEGIN, (src), (mpi_tag),  _starpu_gettid());
-#define TRACE_MPI_UTESTING_END(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_UTESTING_END, (src), (mpi_tag), _starpu_gettid());
-#define TRACE_MPI_UWAIT_BEGIN(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_UWAIT_BEGIN, (src), (mpi_tag),  _starpu_gettid());
-#define TRACE_MPI_UWAIT_END(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_UWAIT_END, (src), (mpi_tag), _starpu_gettid());
+#define _STARPU_MPI_TRACE_START(rank, worldsize)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_START, (rank), (worldsize), _starpu_gettid());
+#define _STARPU_MPI_TRACE_STOP(rank, worldsize)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_STOP, (rank), (worldsize), _starpu_gettid());
+#define _STARPU_MPI_TRACE_BARRIER(rank, worldsize, key)	\
+	FUT_DO_PROBE4(_STARPU_MPI_FUT_BARRIER, (rank), (worldsize), (key), _starpu_gettid());
+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(dest, mpi_tag, size)	\
+	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(dest, mpi_tag, size)	\
+	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_SUBMIT_END, (dest), (mpi_tag), (size), _starpu_gettid());
+#define _STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(src, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN, (src), (mpi_tag), _starpu_gettid());
+#define _STARPU_MPI_TRACE_IRECV_SUBMIT_END(src, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_SUBMIT_END, (src), (mpi_tag), _starpu_gettid());
+#define _STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(dest, mpi_tag, size)	\
+	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_COMPLETE_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
+#define _STARPU_MPI_TRACE_ISEND_COMPLETE_END(dest, mpi_tag, size)	\
+	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_COMPLETE_END, (dest), (mpi_tag), (size), _starpu_gettid());
+#define _STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(src, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_COMPLETE_BEGIN, (src), (mpi_tag), _starpu_gettid());
+#define _STARPU_MPI_TRACE_IRECV_COMPLETE_END(src, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_COMPLETE_END, (src), (mpi_tag), _starpu_gettid());
+#define _STARPU_MPI_TRACE_SLEEP_BEGIN()	\
+	FUT_DO_PROBE1(_STARPU_MPI_FUT_SLEEP_BEGIN, _starpu_gettid());
+#define _STARPU_MPI_TRACE_SLEEP_END()	\
+	FUT_DO_PROBE1(_STARPU_MPI_FUT_SLEEP_END, _starpu_gettid());
+#define _STARPU_MPI_TRACE_DTESTING_BEGIN()	\
+	FUT_DO_PROBE1(_STARPU_MPI_FUT_DTESTING_BEGIN,  _starpu_gettid());
+#define _STARPU_MPI_TRACE_DTESTING_END()	\
+	FUT_DO_PROBE1(_STARPU_MPI_FUT_DTESTING_END, _starpu_gettid());
+#define _STARPU_MPI_TRACE_UTESTING_BEGIN(src, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_UTESTING_BEGIN, (src), (mpi_tag),  _starpu_gettid());
+#define _STARPU_MPI_TRACE_UTESTING_END(src, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_UTESTING_END, (src), (mpi_tag), _starpu_gettid());
+#define _STARPU_MPI_TRACE_UWAIT_BEGIN(src, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_UWAIT_BEGIN, (src), (mpi_tag),  _starpu_gettid());
+#define _STARPU_MPI_TRACE_UWAIT_END(src, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_UWAIT_END, (src), (mpi_tag), _starpu_gettid());
 #define TRACE
 #else
-#define TRACE_MPI_START(a, b)				do {} while(0);
-#define TRACE_MPI_STOP(a, b)				do {} while(0);
-#define TRACE_MPI_BARRIER(a, b, c)			do {} while(0);
-#define TRACE_MPI_ISEND_SUBMIT_BEGIN(a, b, c)		do {} while(0);
-#define TRACE_MPI_ISEND_SUBMIT_END(a, b, c)		do {} while(0);
-#define TRACE_MPI_IRECV_SUBMIT_BEGIN(a, b)		do {} while(0);
-#define TRACE_MPI_IRECV_SUBMIT_END(a, b)		do {} while(0);
-#define TRACE_MPI_ISEND_COMPLETE_BEGIN(a, b, c)		do {} while(0);
-#define TRACE_MPI_ISEND_COMPLETE_END(a, b, c)		do {} while(0);
-#define TRACE_MPI_IRECV_COMPLETE_BEGIN(a, b)		do {} while(0);
-#define TRACE_MPI_IRECV_COMPLETE_END(a, b)		do {} while(0);
-#define TRACE_MPI_SLEEP_BEGIN()				do {} while(0);
-#define TRACE_MPI_SLEEP_END()				do {} while(0);
-#define TRACE_MPI_DTESTING_BEGIN()			do {} while(0);
-#define TRACE_MPI_DTESTING_END()			do {} while(0);
-#define TRACE_MPI_UTESTING_BEGIN(a, b)			do {} while(0);
-#define TRACE_MPI_UTESTING_END(a, b)			do {} while(0);
-#define TRACE_MPI_UWAIT_BEGIN(a, b)			do {} while(0);
-#define TRACE_MPI_UWAIT_END(a, b)			do {} while(0);
+#define _STARPU_MPI_TRACE_START(a, b)				do {} while(0);
+#define _STARPU_MPI_TRACE_STOP(a, b)				do {} while(0);
+#define _STARPU_MPI_TRACE_BARRIER(a, b, c)			do {} while(0);
+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(a, b, c)		do {} while(0);
+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(a, b, c)		do {} while(0);
+#define _STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(a, b)		do {} while(0);
+#define _STARPU_MPI_TRACE_IRECV_SUBMIT_END(a, b)		do {} while(0);
+#define _STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(a, b, c)		do {} while(0);
+#define _STARPU_MPI_TRACE_ISEND_COMPLETE_END(a, b, c)		do {} while(0);
+#define _STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(a, b)		do {} while(0);
+#define _STARPU_MPI_TRACE_IRECV_COMPLETE_END(a, b)		do {} while(0);
+#define _STARPU_MPI_TRACE_SLEEP_BEGIN()				do {} while(0);
+#define _STARPU_MPI_TRACE_SLEEP_END()				do {} while(0);
+#define _STARPU_MPI_TRACE_DTESTING_BEGIN()			do {} while(0);
+#define _STARPU_MPI_TRACE_DTESTING_END()			do {} while(0);
+#define _STARPU_MPI_TRACE_UTESTING_BEGIN(a, b)			do {} while(0);
+#define _STARPU_MPI_TRACE_UTESTING_END(a, b)			do {} while(0);
+#define _STARPU_MPI_TRACE_UWAIT_BEGIN(a, b)			do {} while(0);
+#define _STARPU_MPI_TRACE_UWAIT_END(a, b)			do {} while(0);
 #endif
 
 #ifdef __cplusplus

+ 6 - 5
src/common/starpu_spinlock.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -24,15 +24,17 @@
 int _starpu_spin_init(struct _starpu_spinlock *lock)
 {
 #if defined(STARPU_SPINLOCK_CHECK)
+	starpu_pthread_mutexattr_t errcheck_attr;
 //	memcpy(&lock->errcheck_lock, PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP, sizeof(PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP));
 	int ret;
-	ret = starpu_pthread_mutexattr_init(&lock->errcheck_attr);
+	ret = starpu_pthread_mutexattr_init(&errcheck_attr);
 	STARPU_CHECK_RETURN_VALUE(ret, "pthread_mutexattr_init");
 
-	ret = starpu_pthread_mutexattr_settype(&lock->errcheck_attr, PTHREAD_MUTEX_ERRORCHECK);
+	ret = starpu_pthread_mutexattr_settype(&errcheck_attr, PTHREAD_MUTEX_ERRORCHECK);
 	STARPU_ASSERT(!ret);
 
-	ret = starpu_pthread_mutex_init(&lock->errcheck_lock, &lock->errcheck_attr);
+	ret = starpu_pthread_mutex_init(&lock->errcheck_lock, &errcheck_attr);
+	starpu_pthread_mutexattr_destroy(&errcheck_attr);
 	return ret;
 #else
 	int ret = starpu_pthread_spin_init(&lock->lock, 0);
@@ -44,7 +46,6 @@ int _starpu_spin_init(struct _starpu_spinlock *lock)
 int _starpu_spin_destroy(struct _starpu_spinlock *lock STARPU_ATTRIBUTE_UNUSED)
 {
 #if defined(STARPU_SPINLOCK_CHECK)
-	starpu_pthread_mutexattr_destroy(&lock->errcheck_attr);
 	return starpu_pthread_mutex_destroy(&lock->errcheck_lock);
 #else
 	return starpu_pthread_spin_destroy(&lock->lock);

+ 1 - 2
src/common/starpu_spinlock.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -25,7 +25,6 @@
 struct _starpu_spinlock
 {
 #if defined(STARPU_SPINLOCK_CHECK)
-	starpu_pthread_mutexattr_t errcheck_attr;
 	starpu_pthread_mutex_t errcheck_lock;
 	const char *last_taker;
 #else

+ 21 - 19
src/common/utils.c

@@ -83,10 +83,24 @@ int _starpu_mkpath(const char *s, mode_t mode)
 	if ((_starpu_mkpath(up, mode) == -1) && (errno != EEXIST))
 		goto out;
 
-	if ((mkdir(path, mode) == -1) && (errno != EEXIST))
-		rv = -1;
-	else
+	struct stat sb;
+	if (stat(path, &sb) == 0)
+	{
+		if (!S_ISDIR(sb.st_mode))
+		{
+			fprintf(stderr,"Error: %s is not a directory:\n", path);
+			STARPU_ABORT();
+		}
+		/* It already exists and is a directory.  */
 		rv = 0;
+	}
+	else
+	{
+		if ((mkdir(path, mode) == -1) && (errno != EEXIST))
+			rv = -1;
+		else
+			rv = 0;
+	}
 
 out:
 	olderrno = errno;
@@ -105,23 +119,11 @@ void _starpu_mkpath_and_check(const char *path, mode_t mode)
 
 	ret = _starpu_mkpath(path, mode);
 
-	if (ret == -1)
+	if (ret == -1 && errno != EEXIST)
 	{
-		if (errno != EEXIST)
-		{
-			fprintf(stderr,"Error making StarPU directory %s:\n", path);
-			perror("mkdir");
-			STARPU_ABORT();
-		}
-
-		/* make sure that it is actually a directory */
-		struct stat sb;
-		stat(path, &sb);
-		if (!S_ISDIR(sb.st_mode))
-		{
-			fprintf(stderr,"Error: %s is not a directory:\n", path);
-			STARPU_ABORT();
-		}
+		fprintf(stderr,"Error making StarPU directory %s:\n", path);
+		perror("mkdir");
+		STARPU_ABORT();
 	}
 }
 

+ 2 - 2
src/core/perfmodel/perfmodel_history.c

@@ -1230,7 +1230,7 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 			char archname[32];
 
 			starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), nimpl);
-			_STARPU_DISP("Warning: model %s is not calibrated enough for %s, forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname);
+			_STARPU_DISP("Warning: model %s is not calibrated enough for %s (only %u measurements), forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname, entry && entry->history_entry ? entry->history_entry->nsample : 0);
 			_starpu_set_calibrate_flag(1);
 			model->benchmarking = 1;
 		}
@@ -1272,7 +1272,7 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, s
 		char archname[32];
 
 		starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), nimpl);
-		_STARPU_DISP("Warning: model %s is not calibrated enough for %s, forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname);
+		_STARPU_DISP("Warning: model %s is not calibrated enough for %s (only %u measurements), forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname, entry ? entry->nsample : 0);
 		_starpu_set_calibrate_flag(1);
 		model->benchmarking = 1;
 	}

+ 2 - 0
src/core/sched_policy.c

@@ -841,6 +841,7 @@ pick:
 	 * We do have a task that uses multiformat handles. Let's create the
 	 * required conversion tasks.
 	 */
+	STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
 	unsigned i;
 	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 	for (i = 0; i < nbuffers; i++)
@@ -864,6 +865,7 @@ pick:
 
 	task->mf_skip = 1;
 	starpu_task_list_push_back(&worker->local_tasks, task);
+	STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
 	goto pick;
 
 profiling:

+ 23 - 2
src/core/simgrid.c

@@ -27,6 +27,7 @@
 #ifdef STARPU_SIMGRID
 #include <msg/msg.h>
 #include <smpi/smpif.h>
+#include <sys/resource.h>
 
 #define STARPU_MPI_AS_PREFIX "StarPU-MPI"
 
@@ -160,7 +161,7 @@ int main(int argc, char **argv)
 
 	if (!starpu_main && !(smpi_main && smpi_simulated_main_))
 	{
-		_STARPU_ERROR("The main file of this application needs to be compiled with starpu.h included, to properly define starpu_main\n");
+		_STARPU_ERROR("In simgrid mode, the file containing the main() function of this application needs to be compiled with starpu.h included, to properly rename it into starpu_main\n");
 		exit(EXIT_FAILURE);
 	}
 
@@ -178,7 +179,12 @@ int main(int argc, char **argv)
 #endif
 	/* Simgrid uses tiny stacks by default.  This comes unexpected to our users.  */
 	extern xbt_cfg_t _sg_cfg_set;
-	xbt_cfg_set_int(_sg_cfg_set, "contexts/stack_size", 8192);
+	unsigned stack_size = 8192;
+	struct rlimit rlim;
+	if (getrlimit(RLIMIT_STACK, &rlim) == 0 && rlim.rlim_cur != 0 && rlim.rlim_cur != RLIM_INFINITY)
+		stack_size = rlim.rlim_cur / 1024;
+
+	xbt_cfg_set_int(_sg_cfg_set, "contexts/stack_size", stack_size);
 
 	/* Load XML platform */
 	_starpu_simgrid_get_platform_path(path, sizeof(path));
@@ -196,6 +202,12 @@ void _starpu_simgrid_init()
 	xbt_dynar_t hosts;
 	int i;
 
+	if (!starpu_main && !(smpi_main && smpi_simulated_main_))
+	{
+		_STARPU_ERROR("In simgrid mode, the file containing the main() function of this application needs to be compiled with starpu.h included, to properly rename it into starpu_main\n");
+		exit(EXIT_FAILURE);
+	}
+
 #ifdef HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT
 	if (_starpu_simgrid_running_smpi())
 	{
@@ -253,6 +265,10 @@ void _starpu_simgrid_init()
 	xbt_dynar_free(&hosts);
 }
 
+/*
+ * Tasks
+ */
+
 /* Task execution submitted by StarPU */
 void _starpu_simgrid_execute_job(struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length)
 {
@@ -276,8 +292,13 @@ void _starpu_simgrid_execute_job(struct _starpu_job *j, struct starpu_perfmodel_
 			length/1000000.0*MSG_get_host_speed(MSG_host_self()),
 			0, NULL);
 	MSG_task_execute(simgrid_task);
+	MSG_task_destroy(simgrid_task);
 }
 
+/*
+ * Transfers
+ */
+
 /* Note: simgrid is not parallel, so there is no need to hold locks for management of transfers.  */
 LIST_TYPE(transfer,
 	msg_task_t task;

+ 11 - 3
src/core/workers.c

@@ -1218,16 +1218,17 @@ out:
 			STARPU_ASSERT(worker->local_ordered_tasks[n] == NULL);
 		_starpu_sched_ctx_list_delete(&worker->sched_ctx_list);
 		_starpu_job_list_delete(worker->terminated_jobs);
+		free(worker->local_ordered_tasks);
 	}
 }
 
 /* Condition variable and mutex used to pause/resume. */
 static starpu_pthread_cond_t pause_cond = STARPU_PTHREAD_COND_INITIALIZER;
 static starpu_pthread_mutex_t pause_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
-unsigned _starpu_machine_is_running(void)
+
+void _starpu_may_pause(void)
 {
-	unsigned ret;
-	/* running and pause_depth are just protected by a memory barrier */
+	/* pause_depth is just protected by a memory barrier */
 	STARPU_RMB();
 
 	if (STARPU_UNLIKELY(config.pause_depth > 0)) {
@@ -1237,6 +1238,13 @@ unsigned _starpu_machine_is_running(void)
 		}
 		STARPU_PTHREAD_MUTEX_UNLOCK(&pause_mutex);
 	}
+}
+
+unsigned _starpu_machine_is_running(void)
+{
+	unsigned ret;
+	/* running is just protected by a memory barrier */
+	STARPU_RMB();
 
 	ANNOTATE_HAPPENS_AFTER(&config.running);
 	ret = config.running;

+ 3 - 0
src/core/workers.h

@@ -354,6 +354,9 @@ char ***_starpu_get_argv();
 /* Fill conf with environment variables */
 void _starpu_conf_check_environment(struct starpu_conf *conf);
 
+/* Called by the driver when it is ready to pause  */
+void _starpu_may_pause(void);
+
 /* Has starpu_shutdown already been called ? */
 unsigned _starpu_machine_is_running(void);
 

+ 42 - 23
src/datawizard/coherency.c

@@ -41,8 +41,6 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 	double cost = INFINITY;
 	unsigned src_node_mask = 0;
 
-	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
-
 	for (node = 0; node < nnodes; node++)
 	{
 		if (handle->per_node[node].state != STARPU_INVALID)
@@ -74,15 +72,6 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 				double time = starpu_transfer_predict(i, destination, size);
 				unsigned handling_node;
 
-				/* Avoid transfers which the interface does not want */
-				if (copy_methods->can_copy)
-				{
-					void *src_interface = handle->per_node[i].data_interface;
-					void *dst_interface = handle->per_node[destination].data_interface;
-					if (!copy_methods->can_copy(src_interface, i, dst_interface, destination))
-						continue;
-				}
-
 				/* Avoid indirect transfers */
 				if (!link_supports_direct_transfers(handle, i, destination, &handling_node))
 					continue;
@@ -115,22 +104,22 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 		
 		if (src_node_mask & (1<<i))
 		{
+			int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node) = handle->ops->copy_methods->can_copy;
 			/* Avoid transfers which the interface does not want */
-			if (copy_methods->can_copy)
+			if (can_copy)
 			{
 				void *src_interface = handle->per_node[i].data_interface;
 				void *dst_interface = handle->per_node[destination].data_interface;
 				unsigned handling_node;
 
-				if (!copy_methods->can_copy(src_interface, i, dst_interface, destination))
-					continue;
-
 				if (!link_supports_direct_transfers(handle, i, destination, &handling_node))
 				{
 					/* Avoid through RAM if the interface does not want it */
 					void *ram_interface = handle->per_node[STARPU_MAIN_RAM].data_interface;
-					if (!copy_methods->can_copy(src_interface, i, ram_interface, STARPU_MAIN_RAM)
-					 || !copy_methods->can_copy(ram_interface, STARPU_MAIN_RAM, dst_interface, destination))
+					if ((!can_copy(src_interface, i, ram_interface, STARPU_MAIN_RAM, i)
+					  && !can_copy(src_interface, i, ram_interface, STARPU_MAIN_RAM, STARPU_MAIN_RAM))
+					 || (!can_copy(ram_interface, STARPU_MAIN_RAM, dst_interface, destination, STARPU_MAIN_RAM)
+					  && !can_copy(ram_interface, STARPU_MAIN_RAM, dst_interface, destination, destination)))
 						continue;
 				}
 			}
@@ -251,7 +240,9 @@ static int worker_supports_direct_access(unsigned node, unsigned handling_node)
 
 static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned src_node, unsigned dst_node, unsigned *handling_node)
 {
-	(void) handle; // unused
+	int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node) = handle->ops->copy_methods->can_copy;
+	void *src_interface = handle->per_node[src_node].data_interface;
+	void *dst_interface = handle->per_node[dst_node].data_interface;
 
 	/* XXX That's a hack until we fix cudaMemcpy3DPeerAsync in the block interface
 	 * Perhaps not all data interface provide a direct GPU-GPU transfer
@@ -260,19 +251,19 @@ static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned
 	if (src_node != dst_node && starpu_node_get_kind(src_node) == STARPU_CUDA_RAM && starpu_node_get_kind(dst_node) == STARPU_CUDA_RAM)
 	{
 		const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
-		if (!copy_methods->cuda_to_cuda_async)
+		if (!copy_methods->cuda_to_cuda_async && !copy_methods->any_to_any)
 			return 0;
 	}
 #endif
 
 	/* Note: with CUDA, performance seems a bit better when issuing the transfer from the destination (tested without GPUDirect, but GPUDirect probably behave the same) */
-	if (worker_supports_direct_access(src_node, dst_node))
+	if (worker_supports_direct_access(src_node, dst_node) && (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, dst_node)))
 	{
 		*handling_node = dst_node;
 		return 1;
 	}
 
-	if (worker_supports_direct_access(dst_node, src_node))
+	if (worker_supports_direct_access(dst_node, src_node) && (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, src_node)))
 	{
 		*handling_node = src_node;
 		return 1;
@@ -319,6 +310,10 @@ static int determine_request_path(starpu_data_handle_t handle,
 
 	if (!link_is_valid)
 	{
+		int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node) = handle->ops->copy_methods->can_copy;
+		void *src_interface = handle->per_node[src_node].data_interface;
+		void *dst_interface = handle->per_node[dst_node].data_interface;
+
 		/* We need an intermediate hop to implement data staging
 		 * through main memory. */
 		STARPU_ASSERT(max_len >= 2);
@@ -326,12 +321,36 @@ static int determine_request_path(starpu_data_handle_t handle,
 		/* GPU -> RAM */
 		src_nodes[0] = src_node;
 		dst_nodes[0] = STARPU_MAIN_RAM;
-		handling_nodes[0] = starpu_node_get_kind(src_node) == STARPU_DISK_RAM ? dst_node : src_node;
+
+		if (starpu_node_get_kind(src_node) == STARPU_DISK_RAM)
+			/* Disks don't have their own driver thread */
+			handling_nodes[0] = dst_node;
+		else if (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, src_node))
+		{
+			handling_nodes[0] = src_node;
+		}
+		else
+		{
+			STARPU_ASSERT_MSG(can_copy(src_interface, src_node, dst_interface, dst_node, dst_node), "interface %d refuses all kinds of transfers from node %u to node %u\n", handle->ops->interfaceid, src_node, dst_node);
+			handling_nodes[0] = dst_node;
+		}
 
 		/* RAM -> GPU */
 		src_nodes[1] = STARPU_MAIN_RAM;
 		dst_nodes[1] = dst_node;
-		handling_nodes[1] = starpu_node_get_kind(dst_node) == STARPU_DISK_RAM ? src_node : dst_node;
+
+		if (starpu_node_get_kind(dst_node) == STARPU_DISK_RAM)
+			/* Disks don't have their own driver thread */
+			handling_nodes[1] = src_node;
+		else if (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, dst_node))
+		{
+			handling_nodes[1] = dst_node;
+		}
+		else
+		{
+			STARPU_ASSERT_MSG(can_copy(src_interface, src_node, dst_interface, dst_node, src_node), "interface %d refuses all kinds of transfers from node %u to node %u\n", handle->ops->interfaceid, src_node, dst_node);
+			handling_nodes[1] = src_node;
+		}
 
 		return 2;
 	}

+ 11 - 6
src/datawizard/data_request.c

@@ -155,17 +155,22 @@ int _starpu_wait_data_request_completion(struct _starpu_data_request *r, unsigne
 {
 	int retval;
 	int do_delete = 0;
+	int completed;
 
 	unsigned local_node = _starpu_memory_node_get_local_key();
 
 	do
 	{
-		_starpu_spin_lock(&r->lock);
-
-		if (r->completed)
-			break;
-
-		_starpu_spin_unlock(&r->lock);
+		STARPU_HG_DISABLE_CHECKING(r->completed);
+		completed = r->completed;
+		STARPU_HG_ENABLE_CHECKING(r->completed);
+		if (completed)
+		{
+			_starpu_spin_lock(&r->lock);
+			if (r->completed)
+				break;
+			_starpu_spin_unlock(&r->lock);
+		}
 
 #ifndef STARPU_NON_BLOCKING_DRIVERS
 		_starpu_wake_all_blocked_workers_on_node(r->handling_node);

+ 18 - 2
src/datawizard/malloc.c

@@ -50,7 +50,7 @@ struct malloc_pinned_codelet_struct
 //{
 //	struct malloc_pinned_codelet_struct *s = arg;
 //        //        *(s->ptr) = malloc(s->dim);
-//        starpu_opencl_allocate_memory((void **)(s->ptr), s->dim, CL_MEM_READ_WRITE|CL_MEM_ALLOC_HOST_PTR);
+//        starpu_opencl_allocate_memory(devid, (void **)(s->ptr), s->dim, CL_MEM_READ_WRITE|CL_MEM_ALLOC_HOST_PTR);
 //}
 //#endif
 
@@ -404,6 +404,14 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size)
 			STARPU_ASSERT(last[dst_node] >= addr);
 			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
 #else
+			struct _starpu_worker *worker = _starpu_get_local_worker_key();
+			unsigned devid = _starpu_memory_node_get_devid(dst_node);
+			if (!worker || worker->arch != STARPU_CUDA_WORKER || worker->devid != devid)
+#if defined(HAVE_CUDA_MEMCPY_PEER)
+				starpu_cuda_set_device(devid);
+#else
+				STARPU_ASSERT_MSG(0, "CUDA peer access is not available with this version of CUDA");
+#endif
 			status = cudaMalloc((void **)&addr, size);
 			if (!addr || (status != cudaSuccess))
 			{
@@ -433,7 +441,7 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size)
                                 int ret;
 				cl_mem ptr;
 
-				ret = starpu_opencl_allocate_memory(&ptr, size, CL_MEM_READ_WRITE);
+				ret = starpu_opencl_allocate_memory(_starpu_memory_node_get_devid(dst_node), &ptr, size, CL_MEM_READ_WRITE);
 				if (ret)
 				{
 					addr = 0;
@@ -505,6 +513,14 @@ _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
 #else
 			cudaError_t err;
+			struct _starpu_worker *worker = _starpu_get_local_worker_key();
+			unsigned devid = _starpu_memory_node_get_devid(dst_node);
+			if (!worker || worker->arch != STARPU_CUDA_WORKER || worker->devid != devid)
+#if defined(HAVE_CUDA_MEMCPY_PEER)
+				starpu_cuda_set_device(devid);
+#else
+				STARPU_ASSERT_MSG(0, "CUDA peer access is not available with this version of CUDA");
+#endif
 			err = cudaFree((void*)addr);
 			if (STARPU_UNLIKELY(err != cudaSuccess))
 				STARPU_CUDA_REPORT_ERROR(err);

+ 10 - 5
src/datawizard/memalloc.c

@@ -440,19 +440,24 @@ static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_re
 
 	if (!old_replicate)
 	{
+		/* Free the copy that we made */
 		free(mc->chunk_interface);
 		mc->chunk_interface = NULL;
 	}
 
-	mc->data = new_replicate->handle;
-	/* mc->ops, mc->footprint and mc->interface should be
+	/* XXX: We do not actually reuse the mc at the moment, only the interface */
+
+	/* mc->data = new_replicate->handle; */
+	/* mc->footprint, mc->ops, mc->size_interface, mc->automatically_allocated should be
  	 * unchanged ! */
 
-	/* reinsert the mem chunk in the list of active memory chunks */
-	if (!is_already_in_mc_list)
+	/* remove the mem chunk from the list of active memory chunks, register_mem_chunk will put it back later */
+	if (is_already_in_mc_list)
 	{
-		_starpu_mem_chunk_list_push_back(mc_list[node], mc);
+		_starpu_mem_chunk_list_erase(mc_list[node], mc);
 	}
+
+	free(mc);
 }
 
 static unsigned try_to_reuse_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node, struct _starpu_data_replicate *replicate, unsigned is_already_in_mc_list)

+ 19 - 19
src/debug/traces/starpu_fxt.c

@@ -1893,79 +1893,79 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 				handle_user_event(&ev, options);
 				break;
 
-			case FUT_MPI_START:
+			case _STARPU_MPI_FUT_START:
 				handle_mpi_start(&ev, options);
 				break;
 
-			case FUT_MPI_STOP:
+			case _STARPU_MPI_FUT_STOP:
 				handle_mpi_stop(&ev, options);
 				break;
 
-			case FUT_MPI_BARRIER:
+			case _STARPU_MPI_FUT_BARRIER:
 				handle_mpi_barrier(&ev, options);
 				break;
 
-			case FUT_MPI_ISEND_SUBMIT_BEGIN:
+			case _STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN:
 				handle_mpi_isend_submit_begin(&ev, options);
 				break;
 
-			case FUT_MPI_ISEND_SUBMIT_END:
+			case _STARPU_MPI_FUT_ISEND_SUBMIT_END:
 				handle_mpi_isend_submit_end(&ev, options);
 				break;
 
-			case FUT_MPI_IRECV_SUBMIT_BEGIN:
+			case _STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN:
 				handle_mpi_irecv_submit_begin(&ev, options);
 				break;
 
-			case FUT_MPI_IRECV_SUBMIT_END:
+			case _STARPU_MPI_FUT_IRECV_SUBMIT_END:
 				handle_mpi_irecv_submit_end(&ev, options);
 				break;
 
-			case FUT_MPI_ISEND_COMPLETE_BEGIN:
+			case _STARPU_MPI_FUT_ISEND_COMPLETE_BEGIN:
 				handle_mpi_isend_complete_begin(&ev, options);
 				break;
 
-			case FUT_MPI_ISEND_COMPLETE_END:
+			case _STARPU_MPI_FUT_ISEND_COMPLETE_END:
 				handle_mpi_isend_complete_end(&ev, options);
 				break;
 
-			case FUT_MPI_IRECV_COMPLETE_BEGIN:
+			case _STARPU_MPI_FUT_IRECV_COMPLETE_BEGIN:
 				handle_mpi_irecv_complete_begin(&ev, options);
 				break;
 
-			case FUT_MPI_IRECV_COMPLETE_END:
+			case _STARPU_MPI_FUT_IRECV_COMPLETE_END:
 				handle_mpi_irecv_complete_end(&ev, options);
 				break;
 
-			case FUT_MPI_SLEEP_BEGIN:
+			case _STARPU_MPI_FUT_SLEEP_BEGIN:
 				handle_mpi_sleep_begin(&ev, options);
 				break;
 
-			case FUT_MPI_SLEEP_END:
+			case _STARPU_MPI_FUT_SLEEP_END:
 				handle_mpi_sleep_end(&ev, options);
 				break;
 
-			case FUT_MPI_DTESTING_BEGIN:
+			case _STARPU_MPI_FUT_DTESTING_BEGIN:
 				handle_mpi_dtesting_begin(&ev, options);
 				break;
 
-			case FUT_MPI_DTESTING_END:
+			case _STARPU_MPI_FUT_DTESTING_END:
 				handle_mpi_dtesting_end(&ev, options);
 				break;
 
-			case FUT_MPI_UTESTING_BEGIN:
+			case _STARPU_MPI_FUT_UTESTING_BEGIN:
 				handle_mpi_utesting_begin(&ev, options);
 				break;
 
-			case FUT_MPI_UTESTING_END:
+			case _STARPU_MPI_FUT_UTESTING_END:
 				handle_mpi_utesting_end(&ev, options);
 				break;
 
-			case FUT_MPI_UWAIT_BEGIN:
+			case _STARPU_MPI_FUT_UWAIT_BEGIN:
 				handle_mpi_uwait_begin(&ev, options);
 				break;
 
-			case FUT_MPI_UWAIT_END:
+			case _STARPU_MPI_FUT_UWAIT_END:
 				handle_mpi_uwait_end(&ev, options);
 				break;
 

+ 1 - 1
src/debug/traces/starpu_fxt_mpi.c

@@ -74,7 +74,7 @@ int _starpu_fxt_mpi_find_sync_point(char *filename_in, uint64_t *offset, int *ke
 			break;
 		}
 
-		if (ev.code == FUT_MPI_BARRIER)
+		if (ev.code == _STARPU_MPI_FUT_BARRIER)
 		{
 			/* We found the sync point */
 			*offset = ev.time;

+ 3 - 0
src/drivers/cpu/driver_cpu.c

@@ -301,7 +301,10 @@ _starpu_cpu_worker(void *arg)
 
 	_starpu_cpu_driver_init(args);
 	while (_starpu_machine_is_running())
+	{
+		_starpu_may_pause();
 		_starpu_cpu_driver_run_once(args);
+	}
 	_starpu_cpu_driver_deinit(args);
 
 	return NULL;

+ 8 - 0
src/drivers/cuda/driver_cuda.c

@@ -567,6 +567,11 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 		_STARPU_DEBUG("cuda (%s) dev id %u worker %u thread is ready to run on CPU %d !\n", devname, devid, i, worker->bindid);
 
 		worker->pipeline_length = starpu_get_env_number_default("STARPU_CUDA_PIPELINE", 2);
+		if (worker->pipeline_length > STARPU_MAX_PIPELINE)
+		{
+			_STARPU_DISP("Warning: STARPU_CUDA_PIPELINE is %u, but STARPU_MAX_PIPELINE is only %u", worker->pipeline_length, STARPU_MAX_PIPELINE);
+			worker->pipeline_length = STARPU_MAX_PIPELINE;
+		}
 		_STARPU_TRACE_WORKER_INIT_END(worker_set->workers[i].workerid);
 	}
 
@@ -752,7 +757,10 @@ void *_starpu_cuda_worker(void *_arg)
 	_starpu_cuda_driver_init(worker);
 	_STARPU_TRACE_START_PROGRESS(memnode);
 	while (_starpu_machine_is_running())
+	{
+		_starpu_may_pause();
 		_starpu_cuda_driver_run_once(worker);
+	}
 	_STARPU_TRACE_END_PROGRESS(memnode);
 	_starpu_cuda_driver_deinit(worker);
 

+ 1 - 0
src/drivers/gordon/driver_gordon.c

@@ -343,6 +343,7 @@ void *gordon_worker_inject(struct _starpu_worker_set *arg)
 
 	while(_starpu_machine_is_running())
 	{
+		_starpu_may_pause();
 		if (gordon_busy_enough())
 		{
 			/* gordon already has enough work, wait a little TODO */

+ 2 - 0
src/drivers/mp_common/source_common.c

@@ -683,6 +683,8 @@ void _starpu_src_common_worker(struct _starpu_worker_set * worker_set,
 		int res;
 		struct _starpu_job * j;
 
+		_starpu_may_pause();
+
 		_STARPU_TRACE_START_PROGRESS(memnode);
 		_starpu_datawizard_progress(memnode, 1);
 		_STARPU_TRACE_END_PROGRESS(memnode);

+ 11 - 4
src/drivers/opencl/driver_opencl.c

@@ -214,16 +214,15 @@ cl_int _starpu_opencl_deinit_context(int devid)
 }
 #endif
 
-cl_int starpu_opencl_allocate_memory(cl_mem *mem STARPU_ATTRIBUTE_UNUSED, size_t size STARPU_ATTRIBUTE_UNUSED, cl_mem_flags flags STARPU_ATTRIBUTE_UNUSED)
+cl_int starpu_opencl_allocate_memory(int devid, cl_mem *mem STARPU_ATTRIBUTE_UNUSED, size_t size STARPU_ATTRIBUTE_UNUSED, cl_mem_flags flags STARPU_ATTRIBUTE_UNUSED)
 {
 #ifdef STARPU_SIMGRID
 	STARPU_ABORT();
 #else
 	cl_int err;
         cl_mem memory;
-        struct _starpu_worker *worker = _starpu_get_local_worker_key();
 
-	memory = clCreateBuffer(contexts[worker->devid], flags, size, NULL, &err);
+	memory = clCreateBuffer(contexts[devid], flags, size, NULL, &err);
 	if (err == CL_OUT_OF_HOST_MEMORY) return err;
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
@@ -234,7 +233,7 @@ cl_int starpu_opencl_allocate_memory(cl_mem *mem STARPU_ATTRIBUTE_UNUSED, size_t
 	 */
 	char dummy = 0;
 	cl_event ev;
-	err = clEnqueueWriteBuffer(alloc_queues[worker->devid], memory, CL_TRUE,
+	err = clEnqueueWriteBuffer(alloc_queues[devid], memory, CL_TRUE,
 				   0, sizeof(dummy), &dummy,
 				   0, NULL, &ev);
 	if (err == CL_MEM_OBJECT_ALLOCATION_FAILURE)
@@ -597,6 +596,11 @@ int _starpu_opencl_driver_init(struct _starpu_worker *worker)
 	snprintf(worker->short_name, sizeof(worker->short_name), "OpenCL %u", devid);
 
 	worker->pipeline_length = starpu_get_env_number_default("STARPU_OPENCL_PIPELINE", 2);
+	if (worker->pipeline_length > STARPU_MAX_PIPELINE)
+	{
+		_STARPU_DISP("Warning: STARPU_OPENCL_PIPELINE is %u, but STARPU_MAX_PIPELINE is only %u", worker->pipeline_length, STARPU_MAX_PIPELINE);
+		worker->pipeline_length = STARPU_MAX_PIPELINE;
+	}
 
 	_STARPU_DEBUG("OpenCL (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, worker->bindid);
 
@@ -741,7 +745,10 @@ void *_starpu_opencl_worker(void *_arg)
 	_starpu_opencl_driver_init(worker);
 	_STARPU_TRACE_START_PROGRESS(memnode);
 	while (_starpu_machine_is_running())
+	{
+		_starpu_may_pause();
 		_starpu_opencl_driver_run_once(worker);
+	}
 	_starpu_opencl_driver_deinit(worker);
 	_STARPU_TRACE_END_PROGRESS(memnode);
 

+ 29 - 9
src/sched_policies/eager_central_policy.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  INRIA
  *
@@ -91,6 +91,9 @@ static int push_task_eager_policy(struct starpu_task *task)
 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
 	
 	struct starpu_sched_ctx_iterator it;
+#ifndef STARPU_NON_BLOCKING_DRIVERS
+	char dowake[STARPU_NMAXWORKERS] = { 0 };
+#endif
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 	
@@ -112,20 +115,35 @@ static int push_task_eager_policy(struct starpu_task *task)
 #ifdef STARPU_NON_BLOCKING_DRIVERS
 				starpu_bitmap_unset(data->waiters, worker);
 				/* We really woke at least somebody, no need to wake somebody else */
-				goto out;
+				break;
 #else
-				starpu_pthread_mutex_t *sched_mutex;
-				starpu_pthread_cond_t *sched_cond;
-				starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
-
-				if (starpu_wakeup_worker(worker, sched_cond, sched_mutex))
-				    goto out; // wake up a single worker
+				dowake[worker] = 1;
 #endif
 			}
 	}
-out:
+	/* Let the task free */
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 
+#ifndef STARPU_NON_BLOCKING_DRIVERS
+	/* Now that we have a list of potential workers, try to wake one */
+	if(workers->init_iterator)
+		workers->init_iterator(workers, &it);
+	
+	while(workers->has_next(workers, &it))
+	{
+		worker = workers->get_next(workers, &it);
+		if (dowake[worker])
+		{
+			starpu_pthread_mutex_t *sched_mutex;
+			starpu_pthread_cond_t *sched_cond;
+			starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
+
+			if (starpu_wakeup_worker(worker, sched_cond, sched_mutex))
+				break; // wake up a single worker
+		}
+	}
+#endif
+
 	return 0;
 }
 
@@ -154,9 +172,11 @@ static struct starpu_task *pop_task_eager_policy(unsigned sched_ctx_id)
 	if (_starpu_fifo_empty(data->fifo))
 		return NULL;
 
+#ifdef STARPU_NON_BLOCKING_DRIVERS
 	if (starpu_bitmap_get(data->waiters, workerid))
 		/* Nobody woke us, avoid bothering the mutex */
 		return NULL;
+#endif
 
 	STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
 

+ 29 - 9
src/sched_policies/eager_central_priority_policy.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  INRIA
  *
@@ -139,6 +139,9 @@ static int _starpu_priority_push_task(struct starpu_task *task)
 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
 	
 	struct starpu_sched_ctx_iterator it;
+#ifndef STARPU_NON_BLOCKING_DRIVERS
+	char dowake[STARPU_NMAXWORKERS] = { 0 };
+#endif
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 	
@@ -160,20 +163,35 @@ static int _starpu_priority_push_task(struct starpu_task *task)
 #ifdef STARPU_NON_BLOCKING_DRIVERS
 				starpu_bitmap_unset(data->waiters, worker);
 				/* We really woke at least somebody, no need to wake somebody else */
-				goto out;
+				break;
 #else
-				starpu_pthread_mutex_t *sched_mutex;
-				starpu_pthread_cond_t *sched_cond;
-				starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
-
-				if (starpu_wakeup_worker(worker, sched_cond, sched_mutex))
-				    goto out; // wake up a single worker
+				dowake[worker] = 1;
 #endif
 			}
 	}
-out:
+	/* Let the task free */
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 
+#ifndef STARPU_NON_BLOCKING_DRIVERS
+	/* Now that we have a list of potential workers, try to wake one */
+	if(workers->init_iterator)
+		workers->init_iterator(workers, &it);
+	
+	while(workers->has_next(workers, &it))
+	{
+		worker = workers->get_next(workers, &it);
+		if (dowake[worker])
+		{
+			starpu_pthread_mutex_t *sched_mutex;
+			starpu_pthread_cond_t *sched_cond;
+			starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
+
+			if (starpu_wakeup_worker(worker, sched_cond, sched_mutex))
+				break; // wake up a single worker
+		}
+	}
+#endif
+
 	return 0;
 }
 
@@ -194,9 +212,11 @@ static struct starpu_task *_starpu_priority_pop_task(unsigned sched_ctx_id)
 	if (taskq->total_ntasks == 0)
 		return NULL;
 
+#ifdef STARPU_NON_BLOCKING_DRIVERS
 	if (starpu_bitmap_get(data->waiters, workerid))
 		/* Nobody woke us, avoid bothering the mutex */
 		return NULL;
+#endif
 
 	/* release this mutex before trying to wake up other workers */
 	starpu_pthread_mutex_t *curr_sched_mutex;

+ 3 - 4
src/sched_policies/locality_work_stealing_policy.c

@@ -224,13 +224,12 @@ static int lws_push_task(struct starpu_task *task)
 #ifndef STARPU_NON_BLOCKING_DRIVERS
 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
 	struct starpu_sched_ctx_iterator it;
+	unsigned worker;
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 	while(workers->has_next(workers, &it))
 	{
 		worker = workers->get_next(workers, &it);
-		starpu_pthread_mutex_t *sched_mutex;
-		starpu_pthread_cond_t *sched_cond;
 		starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
 		STARPU_PTHREAD_COND_SIGNAL(sched_cond);
 	}
@@ -368,6 +367,6 @@ struct starpu_sched_policy _starpu_sched_lws_policy =
 	.pre_exec_hook = NULL,
 	.post_exec_hook = NULL,
 	.pop_every_task = NULL,
-	.policy_name = "nws",
-	.policy_description = "new work stealing"
+	.policy_name = "lws",
+	.policy_description = "locality work stealing"
 };

+ 1 - 0
tests/Makefile.am

@@ -151,6 +151,7 @@ noinst_PROGRAMS =				\
 	datawizard/acquire_cb_insert		\
 	datawizard/acquire_release		\
 	datawizard/acquire_release2		\
+	datawizard/cache			\
 	datawizard/commute			\
 	datawizard/copy				\
 	datawizard/data_implicit_deps		\

+ 100 - 0
tests/datawizard/cache.c

@@ -0,0 +1,100 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2014  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "../helper.h"
+
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
+static void codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+     FPRINTF(stderr, "%lx\n", (unsigned long) STARPU_VARIABLE_GET_PTR(descr[0]));
+     FPRINTF(stderr, "codelet\n");
+}
+#endif
+
+#ifdef STARPU_USE_CUDA
+static struct starpu_codelet cuda_cl =
+{
+     .cuda_funcs = {codelet, NULL},
+     .nbuffers = 1,
+     .modes = {STARPU_R}
+};
+#endif
+
+#ifdef STARPU_USE_OPENCL
+static struct starpu_codelet opencl_cl =
+{
+     .opencl_funcs = {codelet, NULL},
+     .nbuffers = 1,
+     .modes = {STARPU_R}
+};
+#endif
+
+void dotest(struct starpu_codelet *cl)
+{
+     int ret;
+     int var = 42;
+     starpu_data_handle_t handle;
+
+     starpu_variable_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)&var, sizeof(var));
+
+     ret = starpu_task_insert(cl, STARPU_R, handle, 0);
+     if (ret == -ENODEV) goto enodev;
+     STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+
+     starpu_task_wait_for_all();
+
+     starpu_data_unregister(handle);
+
+     starpu_variable_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)&var, sizeof(var));
+
+     ret = starpu_task_insert(cl, STARPU_R, handle, 0);
+     if (ret == -ENODEV) goto enodev;
+     STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+
+     starpu_task_wait_for_all();
+
+enodev:
+     starpu_data_unregister(handle);
+}
+
+int main(int argc, char **argv)
+{
+     int ret;
+
+     ret = starpu_init(NULL);
+     if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+     STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_USE_CUDA
+     dotest(&cuda_cl);
+#endif
+#ifdef STARPU_USE_OPENCL
+     dotest(&opencl_cl);
+#endif
+
+     starpu_shutdown();
+
+     return 0;
+
+enodev:
+     starpu_shutdown();
+     /* yes, we do not perform the computation but we did detect that no one
+      * could perform the kernel, so this is not an error from StarPU */
+     fprintf(stderr, "WARNING: No one can execute this task\n");
+     return STARPU_TEST_SKIPPED;
+}

+ 9 - 10
tests/datawizard/gpu_ptr_register.c

@@ -55,11 +55,13 @@ submit_tasks(starpu_data_handle_t handle, int pieces, int n)
 static int
 find_a_worker(enum starpu_worker_archtype type)
 {
-	int worker;
-	int ret = starpu_worker_get_ids_by_type(type, &worker, 1);
+	int worker[STARPU_NMAXWORKERS];
+	int ret = starpu_worker_get_ids_by_type(type, worker, STARPU_NMAXWORKERS);
 	if (ret == 0)
 		return -ENODEV;
-	return worker;
+	if (ret == -ERANGE)
+		return worker[STARPU_NMAXWORKERS-1];
+	return worker[ret-1];
 }
 
 static int
@@ -78,7 +80,7 @@ check_result(unsigned *t, size_t size)
 }
 
 #ifdef STARPU_USE_CUDA
-#if CUDART_VERSION >= 4000
+#ifdef HAVE_CUDA_MEMCPY_PEER
 static int
 test_cuda(void)
 {
@@ -100,8 +102,7 @@ test_cuda(void)
 	size = 10 * n;
 
 	devid = starpu_worker_get_devid(chosen);
-	starpu_cuda_set_device(devid);
-	cudaMalloc((void**)&foo_gpu, size * sizeof(*foo_gpu));
+	foo_gpu = (void*) starpu_malloc_on_node(starpu_worker_get_memory_node(chosen), size * sizeof(*foo_gpu));
 
 	foo = calloc(size, sizeof(*foo));
 	for (i = 0; i < size; i++)
@@ -180,9 +181,7 @@ test_opencl(void)
 	starpu_opencl_get_context(devid, &context);
 	starpu_opencl_get_queue(devid, &queue);
 
-	foo_gpu = clCreateBuffer(context, CL_MEM_READ_WRITE, size*sizeof(int), NULL, &err);
-	if (STARPU_UNLIKELY(err != CL_SUCCESS))
-		STARPU_OPENCL_REPORT_ERROR(err);
+	foo_gpu = (void*) starpu_malloc_on_node(starpu_worker_get_memory_node(chosen), size * sizeof(int));
 
 	unsigned int *foo = malloc(size*sizeof(*foo));
 	for (i = 0; i < size; i++)
@@ -261,7 +260,7 @@ int main(int argc, char **argv)
 #endif
 
 #ifdef STARPU_USE_CUDA
-#if CUDART_VERSION >= 4000 /* We need thread-safety of CUDA */
+#ifdef HAVE_CUDA_MEMCPY_PEER
 	ret = test_cuda();
 	if (ret == 1)
 		goto fail;

+ 10 - 11
tests/datawizard/gpu_register.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011-2012  Université de Bordeaux 1
+ * Copyright (C) 2011-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2012 inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -55,11 +55,13 @@ submit_tasks(starpu_data_handle_t handle, int pieces, int n)
 static int
 find_a_worker(enum starpu_worker_archtype type)
 {
-	int worker;
-	int ret = starpu_worker_get_ids_by_type(type, &worker, 1);
+	int worker[STARPU_NMAXWORKERS];
+	int ret = starpu_worker_get_ids_by_type(type, worker, STARPU_NMAXWORKERS);
 	if (ret == 0)
 		return -ENODEV;
-	return worker;
+	if (ret == -ERANGE)
+		return worker[STARPU_NMAXWORKERS-1];
+	return worker[ret-1];
 }
 
 static int
@@ -78,7 +80,7 @@ check_result(unsigned *t, size_t size)
 }
 
 #ifdef STARPU_USE_CUDA
-#if CUDART_VERSION >= 4000
+#ifdef HAVE_CUDA_MEMCPY_PEER
 static int
 test_cuda(void)
 {
@@ -100,8 +102,7 @@ test_cuda(void)
 	size = 10 * n;
 
 	devid = starpu_worker_get_devid(chosen);
-	starpu_cuda_set_device(devid);
-	cudaMalloc((void**)&foo_gpu, size * sizeof(*foo_gpu));
+	foo_gpu = (void*) starpu_malloc_on_node(starpu_worker_get_memory_node(chosen), size * sizeof(*foo_gpu));
 
 	foo = calloc(size, sizeof(*foo));
 	for (i = 0; i < size; i++)
@@ -182,9 +183,7 @@ test_opencl(void)
 	starpu_opencl_get_context(devid, &context);
 	starpu_opencl_get_queue(devid, &queue);
 
-	foo_gpu = clCreateBuffer(context, CL_MEM_READ_WRITE, size*sizeof(int), NULL, &err);
-	if (STARPU_UNLIKELY(err != CL_SUCCESS))
-		STARPU_OPENCL_REPORT_ERROR(err);
+	foo_gpu = (void*) starpu_malloc_on_node(starpu_worker_get_memory_node(chosen), size * sizeof(int));
 
 	unsigned int *foo = malloc(size*sizeof(*foo));
 	for (i = 0; i < size; i++)
@@ -269,7 +268,7 @@ int main(int argc, char **argv)
 #endif
 
 #ifdef STARPU_USE_CUDA
-#if CUDART_VERSION >= 4000 /* We need thread-safety of CUDA */
+#ifdef HAVE_CUDA_MEMCPY_PEER
 	ret = test_cuda();
 	if (ret == 1)
 		goto fail;

+ 11 - 0
tests/sched_policies/simple_cpu_gpu_sched.c

@@ -205,7 +205,18 @@ run(struct starpu_sched_policy *policy)
 	if (cpu_task_worker != STARPU_CPU_WORKER ||
 			(gpu_task_worker != STARPU_CUDA_WORKER &&
 			 gpu_task_worker != STARPU_OPENCL_WORKER))
+	{
+		if (cpu_task_worker != STARPU_CPU_WORKER)
+		{
+			FPRINTF(stderr, "The CPU task did not run on a CPU worker\n");
+		}
+		if (gpu_task_worker != STARPU_CUDA_WORKER && gpu_task_worker != STARPU_OPENCL_WORKER)
+		{
+			FPRINTF(stderr, "The GPU task did not run on a Cuda or OpenCL worker\n");
+		}
+
 		ret = 1;
+	}
 	else
 		ret = 0;