Browse Source

merge trunk

Samuel Thibault 10 years ago
parent
commit
8a1120b2d1
41 changed files with 540 additions and 252 deletions
  1. 2 1
      Makefile.am
  2. 2 0
      configure.ac
  3. 7 5
      doc/doxygen/chapters/13offline_performance_tools.doxy
  4. 2 2
      doc/doxygen/chapters/21simgrid.doxy
  5. 6 4
      doc/doxygen/chapters/api/data_interfaces.doxy
  6. 2 1
      examples/basic_examples/vector_scal_c.c
  7. 1 1
      examples/filters/custom_mf/custom_interface.c
  8. 6 0
      examples/lu/lu_example.c
  9. 1 6
      include/starpu.h
  10. 1 1
      include/starpu_data_interfaces.h
  11. 2 2
      include/starpu_opencl.h
  12. 28 0
      include/starpu_simgrid_wrap.h
  13. 17 17
      mpi/src/starpu_mpi.c
  14. 76 76
      mpi/src/starpu_mpi_fxt.h
  15. 6 5
      src/common/starpu_spinlock.c
  16. 1 2
      src/common/starpu_spinlock.h
  17. 21 19
      src/common/utils.c
  18. 2 2
      src/core/perfmodel/perfmodel_history.c
  19. 2 0
      src/core/sched_policy.c
  20. 23 2
      src/core/simgrid.c
  21. 11 3
      src/core/workers.c
  22. 3 0
      src/core/workers.h
  23. 42 23
      src/datawizard/coherency.c
  24. 11 6
      src/datawizard/data_request.c
  25. 18 2
      src/datawizard/malloc.c
  26. 10 5
      src/datawizard/memalloc.c
  27. 19 19
      src/debug/traces/starpu_fxt.c
  28. 1 1
      src/debug/traces/starpu_fxt_mpi.c
  29. 3 0
      src/drivers/cpu/driver_cpu.c
  30. 8 0
      src/drivers/cuda/driver_cuda.c
  31. 1 0
      src/drivers/gordon/driver_gordon.c
  32. 2 0
      src/drivers/mp_common/source_common.c
  33. 11 4
      src/drivers/opencl/driver_opencl.c
  34. 29 9
      src/sched_policies/eager_central_policy.c
  35. 29 9
      src/sched_policies/eager_central_priority_policy.c
  36. 3 4
      src/sched_policies/locality_work_stealing_policy.c
  37. 1 0
      tests/Makefile.am
  38. 100 0
      tests/datawizard/cache.c
  39. 9 10
      tests/datawizard/gpu_ptr_register.c
  40. 10 11
      tests/datawizard/gpu_register.c
  41. 11 0
      tests/sched_policies/simple_cpu_gpu_sched.c

+ 2 - 1
Makefile.am

@@ -85,7 +85,8 @@ versinclude_HEADERS = 				\
 	include/starpu_stdlib.h			\
 	include/starpu_stdlib.h			\
 	include/starpu_thread.h			\
 	include/starpu_thread.h			\
 	include/starpu_thread_util.h		\
 	include/starpu_thread_util.h		\
-	include/starpu_tree.h
+	include/starpu_tree.h			\
+	include/starpu_simgrid_wrap.h
 
 
 nodist_versinclude_HEADERS = 			\
 nodist_versinclude_HEADERS = 			\
 	include/starpu_config.h
 	include/starpu_config.h

+ 2 - 0
configure.ac

@@ -1820,6 +1820,8 @@ AC_SUBST(USE_MPI, $use_mpi)
 AM_CONDITIONAL(USE_MPI, test x$use_mpi = xyes)
 AM_CONDITIONAL(USE_MPI, test x$use_mpi = xyes)
 if test x$use_mpi = xyes; then
 if test x$use_mpi = xyes; then
 	AC_DEFINE(STARPU_USE_MPI,[],[whether the StarPU MPI library is available])
 	AC_DEFINE(STARPU_USE_MPI,[],[whether the StarPU MPI library is available])
+else
+	running_mpi_check=no
 fi
 fi
 
 
 AC_ARG_ENABLE(mpi-progression-hook, [AS_HELP_STRING([--enable-mpi-progression-hook],
 AC_ARG_ENABLE(mpi-progression-hook, [AS_HELP_STRING([--enable-mpi-progression-hook],

+ 7 - 5
doc/doxygen/chapters/13offline_performance_tools.doxy

@@ -119,8 +119,9 @@ $ vite paje.trace
 To get names of tasks instead of "unknown", fill the optional
 To get names of tasks instead of "unknown", fill the optional
 starpu_codelet::name, or use a performance model for them.
 starpu_codelet::name, or use a performance model for them.
 Details of the codelet execution can be obtained by passing
 Details of the codelet execution can be obtained by passing
-<c>--enable-paje-codelet-details</c> and using a recent enough version of ViTE
-(at least r1430).
+\ref enable-paje-codelet-details "--enable-paje-codelet-details" when
+configuring StarPU and using a recent enough version of ViTE (at least
+r1430).
 
 
 In the MPI execution case, collect the trace files from the MPI nodes, and
 In the MPI execution case, collect the trace files from the MPI nodes, and
 specify them all on the command <c>starpu_fxt_tool</c>, for instance:
 specify them all on the command <c>starpu_fxt_tool</c>, for instance:
@@ -133,9 +134,10 @@ By default, all tasks are displayed using a green color. To display tasks with
 varying colors, pass option <c>-c</c> to <c>starpu_fxt_tool</c>.
 varying colors, pass option <c>-c</c> to <c>starpu_fxt_tool</c>.
 
 
 To identify tasks precisely, the application can set the ::tag_id field of the
 To identify tasks precisely, the application can set the ::tag_id field of the
-tasks (or use STARPU_TAG_ONY when using starpu_task_insert), and with a recent
-enough version of vite (>= r1430) and the <c>--enable-paje-codelet-details</c>
-configure option, the value of the tag will show up in the trace.
+tasks (or use STARPU_TAG_ONLY when using starpu_task_insert()), and with a recent
+enough version of vite (>= r1430) and the
+\ref enable-paje-codelet-details "--enable-paje-codelet-details"
+StarPU configure option, the value of the tag will show up in the trace.
 
 
 Traces can also be inspected by hand by using the tool <c>fxt_print</c>, for instance:
 Traces can also be inspected by hand by using the tool <c>fxt_print</c>, for instance:
 
 

+ 2 - 2
doc/doxygen/chapters/21simgrid.doxy

@@ -22,9 +22,9 @@ get the simulated time, it has to use starpu_timing_now() which returns the
 virtual timestamp in us.
 virtual timestamp in us.
 
 
 For some technical reason, the application's .c file which contains main() has
 For some technical reason, the application's .c file which contains main() has
-to be recompiled with starpu.h, which in the simgrid case will # define main()
+to be recompiled with starpu_simgrid_wrap.h, which in the simgrid case will # define main()
 into starpu_main(), and it is libstarpu which will provide the real main() and
 into starpu_main(), and it is libstarpu which will provide the real main() and
-call the application's main().
+will call the application's main().
 
 
 To be able to test with crazy data sizes, one may want to only allocate
 To be able to test with crazy data sizes, one may want to only allocate
 application data if STARPU_SIMGRID is not defined.  Passing a NULL pointer to
 application data if STARPU_SIMGRID is not defined.  Passing a NULL pointer to

+ 6 - 4
doc/doxygen/chapters/api/data_interfaces.doxy

@@ -57,8 +57,9 @@ case of e.g. available particular CUDA or OpenCL support.
 \ingroup API_Data_Interfaces
 \ingroup API_Data_Interfaces
 \var starpu_data_copy_methods::can_copy
 \var starpu_data_copy_methods::can_copy
 If defined, allows the interface to declare whether it supports transferring
 If defined, allows the interface to declare whether it supports transferring
-from \p src_interface on node \p src_node to \p dst_interface on node \p. If not
-defined, it is assumed that the interface supports all transfers.
+from \p src_interface on node \p src_node to \p dst_interface on node \p
+dst_node, run from node \p handling_node. If not defined, it is assumed that the
+interface supports all transfers.
 \var starpu_data_copy_methods::ram_to_ram
 \var starpu_data_copy_methods::ram_to_ram
 Define how to copy data from the \p src_interface interface on the \p
 Define how to copy data from the \p src_interface interface on the \p
 src_node CPU node to the \p dst_interface interface on the \p dst_node
 src_node CPU node to the \p dst_interface interface on the \p dst_node
@@ -1000,11 +1001,12 @@ DefiningANewDataInterface.
 \ingroup API_Data_Interfaces
 \ingroup API_Data_Interfaces
 Allocate \p size bytes on node \p dst_node. This returns 0 if
 Allocate \p size bytes on node \p dst_node. This returns 0 if
 allocation failed, the allocation method should then return <c>-ENOMEM</c> as
 allocation failed, the allocation method should then return <c>-ENOMEM</c> as
-allocated size.
+allocated size. Deallocation must be done with starpu_free_on_node.
 
 
 \fn void starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 \fn void starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 \ingroup API_Data_Interfaces
 \ingroup API_Data_Interfaces
-Free \p addr of \p size bytes on node \p dst_node.
+Free \p addr of \p size bytes on node \p dst_node which was previously allocated
+with starpu_malloc_on_node.
 
 
 \fn int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, void *async_data)
 \fn int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, void *async_data)
 \ingroup API_Data_Interfaces
 \ingroup API_Data_Interfaces

+ 2 - 1
examples/basic_examples/vector_scal_c.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- * Copyright (C) 2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2011, 2013-2014  Université de Bordeaux 1
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -47,6 +47,7 @@ static struct starpu_codelet cl =
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	/* CUDA implementation of the codelet */
 	/* CUDA implementation of the codelet */
 	.cuda_funcs = {scal_cuda_func, NULL},
 	.cuda_funcs = {scal_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #endif
 	.nbuffers = 1,
 	.nbuffers = 1,
 	.model = &vector_scal_model
 	.model = &vector_scal_model

+ 1 - 1
examples/filters/custom_mf/custom_interface.c

@@ -425,7 +425,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
 	size = src_custom->nx * 2 * sizeof(float);
 	size = src_custom->nx * 2 * sizeof(float);
 	if (dst_custom->cpu_ptr == NULL)
 	if (dst_custom->cpu_ptr == NULL)
 	{
 	{
-		ret = starpu_opencl_allocate_memory((cl_mem*)&dst_custom->cpu_ptr,
+		ret = starpu_opencl_allocate_memory(devid, (cl_mem*)&dst_custom->cpu_ptr,
 				size, CL_MEM_READ_WRITE);
 				size, CL_MEM_READ_WRITE);
 		assert(ret == CL_SUCCESS);
 		assert(ret == CL_SUCCESS);
 	}
 	}

+ 6 - 0
examples/lu/lu_example.c

@@ -58,6 +58,7 @@ static void parse_args(int argc, char **argv)
 			nblocks = strtol(argv[++i], &argptr, 10);
 			nblocks = strtol(argv[++i], &argptr, 10);
 		}
 		}
 
 
+#ifndef STARPU_SIMGRID
 		else if (strcmp(argv[i], "-check") == 0)
 		else if (strcmp(argv[i], "-check") == 0)
 		{
 		{
 			check = 1;
 			check = 1;
@@ -72,6 +73,7 @@ static void parse_args(int argc, char **argv)
 		{
 		{
 			no_stride = 1;
 			no_stride = 1;
 		}
 		}
+#endif
 
 
 		else if (strcmp(argv[i], "-profile") == 0)
 		else if (strcmp(argv[i], "-profile") == 0)
 		{
 		{
@@ -315,6 +317,7 @@ int main(int argc, char **argv)
 
 
 	starpu_cublas_init();
 	starpu_cublas_init();
 
 
+#ifndef STARPU_SIMGRID
 	init_matrix();
 	init_matrix();
 
 
 	unsigned *ipiv = NULL;
 	unsigned *ipiv = NULL;
@@ -364,6 +367,7 @@ int main(int argc, char **argv)
 		}
 		}
 	}
 	}
 	else
 	else
+#endif
 	{
 	{
 		ret = STARPU_LU(lu_decomposition)(A, size, size, nblocks);
 		ret = STARPU_LU(lu_decomposition)(A, size, size, nblocks);
 	}
 	}
@@ -403,6 +407,7 @@ int main(int argc, char **argv)
 		}
 		}
 	}
 	}
 
 
+#ifndef STARPU_SIMGRID
 	if (check)
 	if (check)
 	{
 	{
 		FPRINTF(stderr, "Checking result\n");
 		FPRINTF(stderr, "Checking result\n");
@@ -413,6 +418,7 @@ int main(int argc, char **argv)
 
 
 		check_result();
 		check_result();
 	}
 	}
+#endif
 
 
 	starpu_free(A);
 	starpu_free(A);
 
 

+ 1 - 6
include/starpu.h

@@ -66,18 +66,13 @@ typedef UINT_PTR uintptr_t;
 #include <starpu_fxt.h>
 #include <starpu_fxt.h>
 #include <starpu_driver.h>
 #include <starpu_driver.h>
 #include <starpu_tree.h>
 #include <starpu_tree.h>
+#include <starpu_simgrid_wrap.h>
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus
 extern "C"
 extern "C"
 {
 {
 #endif
 #endif
 
 
-#ifdef STARPU_SIMGRID
-#ifndef main
-#define main starpu_main
-#endif
-#endif
-
 struct starpu_conf
 struct starpu_conf
 {
 {
 	int magic;
 	int magic;

+ 1 - 1
include/starpu_data_interfaces.h

@@ -37,7 +37,7 @@ extern "C"
 
 
 struct starpu_data_copy_methods
 struct starpu_data_copy_methods
 {
 {
-	int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+	int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node);
 
 
 	int (*ram_to_ram)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*ram_to_ram)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*ram_to_cuda)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*ram_to_cuda)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);

+ 2 - 2
include/starpu_opencl.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -76,7 +76,7 @@ int starpu_opencl_collect_stats(cl_event event);
 
 
 int starpu_opencl_set_kernel_args(cl_int *err, cl_kernel *kernel, ...);
 int starpu_opencl_set_kernel_args(cl_int *err, cl_kernel *kernel, ...);
 
 
-cl_int starpu_opencl_allocate_memory(cl_mem *addr, size_t size, cl_mem_flags flags);
+cl_int starpu_opencl_allocate_memory(int devid, cl_mem *addr, size_t size, cl_mem_flags flags);
 
 
 cl_int starpu_opencl_copy_ram_to_opencl(void *ptr, unsigned src_node, cl_mem buffer, unsigned dst_node, size_t size, size_t offset, cl_event *event, int *ret);
 cl_int starpu_opencl_copy_ram_to_opencl(void *ptr, unsigned src_node, cl_mem buffer, unsigned dst_node, size_t size, size_t offset, cl_event *event, int *ret);
 
 

+ 28 - 0
include/starpu_simgrid_wrap.h

@@ -0,0 +1,28 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_SIMGRID_WRAP_H__
+#define __STARPU_SIMGRID_WRAP_H__
+
+#include <starpu_config.h>
+
+#ifdef STARPU_SIMGRID
+#ifndef main
+#define main starpu_main
+#endif
+#endif
+
+#endif /* __STARPU_SIMGRID_WRAP_H__ */

+ 17 - 17
mpi/src/starpu_mpi.c

@@ -173,12 +173,12 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
 
 
 	 _starpu_mpi_comm_amounts_inc(req->comm, req->srcdst, req->datatype, req->count);
 	 _starpu_mpi_comm_amounts_inc(req->comm, req->srcdst, req->datatype, req->count);
 
 
-	 TRACE_MPI_ISEND_SUBMIT_BEGIN(req->srcdst, req->mpi_tag, 0);
+	 _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(req->srcdst, req->mpi_tag, 0);
 
 
 	 req->ret = MPI_Isend(req->ptr, req->count, req->datatype, req->srcdst, _starpu_mpi_tag, req->comm, &req->request);
 	 req->ret = MPI_Isend(req->ptr, req->count, req->datatype, req->srcdst, _starpu_mpi_tag, req->comm, &req->request);
 	 STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Isend returning %d", req->ret);
 	 STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Isend returning %d", req->ret);
 
 
-	 TRACE_MPI_ISEND_SUBMIT_END(req->srcdst, req->mpi_tag, 0);
+	 _STARPU_MPI_TRACE_ISEND_SUBMIT_END(req->srcdst, req->mpi_tag, 0);
 
 
 	 /* somebody is perhaps waiting for the MPI request to be posted */
 	 /* somebody is perhaps waiting for the MPI request to be posted */
 	 STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
 	 STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
@@ -304,12 +304,12 @@ static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 
 
 	_STARPU_MPI_DEBUG(20, "post MPI irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
 	_STARPU_MPI_DEBUG(20, "post MPI irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
 
 
-	TRACE_MPI_IRECV_SUBMIT_BEGIN(req->srcdst, req->mpi_tag);
+	_STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(req->srcdst, req->mpi_tag);
 
 
 	req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->srcdst, _starpu_mpi_tag, req->comm, &req->request);
 	req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->srcdst, _starpu_mpi_tag, req->comm, &req->request);
 	STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_IRecv returning %d", req->ret);
 	STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_IRecv returning %d", req->ret);
 
 
-	TRACE_MPI_IRECV_SUBMIT_END(req->srcdst, req->mpi_tag);
+	_STARPU_MPI_TRACE_IRECV_SUBMIT_END(req->srcdst, req->mpi_tag);
 
 
 	/* somebody is perhaps waiting for the MPI request to be posted */
 	/* somebody is perhaps waiting for the MPI request to be posted */
 	STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
@@ -413,12 +413,12 @@ static void _starpu_mpi_wait_func(struct _starpu_mpi_req *waiting_req)
 	/* Which is the mpi request we are waiting for ? */
 	/* Which is the mpi request we are waiting for ? */
 	struct _starpu_mpi_req *req = waiting_req->other_request;
 	struct _starpu_mpi_req *req = waiting_req->other_request;
 
 
-	TRACE_MPI_UWAIT_BEGIN(req->srcdst, req->mpi_tag);
+	_STARPU_MPI_TRACE_UWAIT_BEGIN(req->srcdst, req->mpi_tag);
 
 
 	req->ret = MPI_Wait(&req->request, waiting_req->status);
 	req->ret = MPI_Wait(&req->request, waiting_req->status);
 	STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Wait returning %d", req->ret);
 	STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Wait returning %d", req->ret);
 
 
-	TRACE_MPI_UWAIT_END(req->srcdst, req->mpi_tag);
+	_STARPU_MPI_TRACE_UWAIT_END(req->srcdst, req->mpi_tag);
 
 
 	_starpu_mpi_handle_request_termination(req);
 	_starpu_mpi_handle_request_termination(req);
 	_STARPU_MPI_LOG_OUT();
 	_STARPU_MPI_LOG_OUT();
@@ -481,12 +481,12 @@ static void _starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
 	_STARPU_MPI_DEBUG(2, "Test request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n",
 	_STARPU_MPI_DEBUG(2, "Test request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n",
 			  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
 			  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
 
 
-	TRACE_MPI_UTESTING_BEGIN(req->srcdst, req->mpi_tag);
+	_STARPU_MPI_TRACE_UTESTING_BEGIN(req->srcdst, req->mpi_tag);
 
 
 	req->ret = MPI_Test(&req->request, testing_req->flag, testing_req->status);
 	req->ret = MPI_Test(&req->request, testing_req->flag, testing_req->status);
 	STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Test returning %d", req->ret);
 	STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Test returning %d", req->ret);
 
 
-	TRACE_MPI_UTESTING_END(req->srcdst, req->mpi_tag);
+	_STARPU_MPI_TRACE_UTESTING_END(req->srcdst, req->mpi_tag);
 
 
 	if (*testing_req->flag)
 	if (*testing_req->flag)
 	{
 	{
@@ -933,22 +933,22 @@ static void _starpu_mpi_test_detached_requests(void)
 		{
 		{
 			if (req->request_type == RECV_REQ)
 			if (req->request_type == RECV_REQ)
 			{
 			{
-				TRACE_MPI_IRECV_COMPLETE_BEGIN(req->srcdst, req->mpi_tag);
+				_STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(req->srcdst, req->mpi_tag);
 			}
 			}
 			else if (req->request_type == SEND_REQ)
 			else if (req->request_type == SEND_REQ)
 			{
 			{
-				TRACE_MPI_ISEND_COMPLETE_BEGIN(req->srcdst, req->mpi_tag, 0);
+				_STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(req->srcdst, req->mpi_tag, 0);
 			}
 			}
 
 
 			_starpu_mpi_handle_request_termination(req);
 			_starpu_mpi_handle_request_termination(req);
 
 
 			if (req->request_type == RECV_REQ)
 			if (req->request_type == RECV_REQ)
 			{
 			{
-				TRACE_MPI_IRECV_COMPLETE_END(req->srcdst, req->mpi_tag);
+				_STARPU_MPI_TRACE_IRECV_COMPLETE_END(req->srcdst, req->mpi_tag);
 			}
 			}
 			else if (req->request_type == SEND_REQ)
 			else if (req->request_type == SEND_REQ)
 			{
 			{
-				TRACE_MPI_ISEND_COMPLETE_END(req->srcdst, req->mpi_tag, 0);
+				_STARPU_MPI_TRACE_ISEND_COMPLETE_END(req->srcdst, req->mpi_tag, 0);
 			}
 			}
 		}
 		}
 
 
@@ -1057,7 +1057,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
 	MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
 
 
 	{
 	{
-		TRACE_MPI_START(rank, worldsize);
+		_STARPU_MPI_TRACE_START(rank, worldsize);
 #ifdef STARPU_USE_FXT
 #ifdef STARPU_USE_FXT
 		starpu_profiling_set_id(rank);
 		starpu_profiling_set_id(rank);
 #endif //STARPU_USE_FXT
 #endif //STARPU_USE_FXT
@@ -1097,14 +1097,14 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		{
 		{
 			_STARPU_MPI_DEBUG(3, "NO MORE REQUESTS TO HANDLE\n");
 			_STARPU_MPI_DEBUG(3, "NO MORE REQUESTS TO HANDLE\n");
 
 
-			TRACE_MPI_SLEEP_BEGIN();
+			_STARPU_MPI_TRACE_SLEEP_BEGIN();
 
 
 			if (barrier_running)
 			if (barrier_running)
 				/* Tell mpi_barrier */
 				/* Tell mpi_barrier */
 				STARPU_PTHREAD_COND_SIGNAL(&cond_finished);
 				STARPU_PTHREAD_COND_SIGNAL(&cond_finished);
 			STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
 			STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
 
 
-			TRACE_MPI_SLEEP_END();
+			_STARPU_MPI_TRACE_SLEEP_END();
 		}
 		}
 
 
 		/* get one request */
 		/* get one request */
@@ -1323,7 +1323,7 @@ static void _starpu_mpi_add_sync_point_in_fxt(void)
 	ret = MPI_Bcast(&random_number, 1, MPI_INT, 0, MPI_COMM_WORLD);
 	ret = MPI_Bcast(&random_number, 1, MPI_INT, 0, MPI_COMM_WORLD);
 	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Bcast returning %d", ret);
 	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Bcast returning %d", ret);
 
 
-	TRACE_MPI_BARRIER(rank, worldsize, random_number);
+	_STARPU_MPI_TRACE_BARRIER(rank, worldsize, random_number);
 
 
 	_STARPU_MPI_DEBUG(3, "unique key %x\n", random_number);
 	_STARPU_MPI_DEBUG(3, "unique key %x\n", random_number);
 #endif
 #endif
@@ -1407,7 +1407,7 @@ int starpu_mpi_shutdown(void)
 	starpu_progression_hook_deregister(hookid);
 	starpu_progression_hook_deregister(hookid);
 #endif /* STARPU_MPI_ACTIVITY */
 #endif /* STARPU_MPI_ACTIVITY */
 
 
-	TRACE_MPI_STOP(rank, world_size);
+	_STARPU_MPI_TRACE_STOP(rank, world_size);
 
 
 	/* free the request queues */
 	/* free the request queues */
 	_starpu_mpi_req_list_delete(detached_requests);
 	_starpu_mpi_req_list_delete(detached_requests);

+ 76 - 76
mpi/src/starpu_mpi_fxt.h

@@ -26,86 +26,86 @@
 extern "C" {
 extern "C" {
 #endif
 #endif
 
 
-#define FUT_MPI_START				0x5201
-#define FUT_MPI_STOP				0x5202
-#define FUT_MPI_BARRIER				0x5203
-#define FUT_MPI_ISEND_SUBMIT_BEGIN		0x5204
-#define FUT_MPI_ISEND_SUBMIT_END		0x5205
-#define FUT_MPI_IRECV_SUBMIT_BEGIN		0x5206
-#define FUT_MPI_IRECV_SUBMIT_END		0x5207
-#define FUT_MPI_ISEND_COMPLETE_BEGIN		0x5208
-#define FUT_MPI_ISEND_COMPLETE_END		0x5209
-#define FUT_MPI_IRECV_COMPLETE_BEGIN		0x5210
-#define FUT_MPI_IRECV_COMPLETE_END		0x5211
-#define FUT_MPI_SLEEP_BEGIN			0x5212
-#define FUT_MPI_SLEEP_END			0x5213
-#define FUT_MPI_DTESTING_BEGIN			0x5214
-#define FUT_MPI_DTESTING_END			0x5215
-#define FUT_MPI_UTESTING_BEGIN			0x5216
-#define FUT_MPI_UTESTING_END			0x5217
-#define FUT_MPI_UWAIT_BEGIN			0x5218
-#define FUT_MPI_UWAIT_END			0x5219
+#define _STARPU_MPI_FUT_START				0x5201
+#define _STARPU_MPI_FUT_STOP				0x5202
+#define _STARPU_MPI_FUT_BARRIER				0x5203
+#define _STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN		0x5204
+#define _STARPU_MPI_FUT_ISEND_SUBMIT_END		0x5205
+#define _STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN		0x5206
+#define _STARPU_MPI_FUT_IRECV_SUBMIT_END		0x5207
+#define _STARPU_MPI_FUT_ISEND_COMPLETE_BEGIN		0x5208
+#define _STARPU_MPI_FUT_ISEND_COMPLETE_END		0x5209
+#define _STARPU_MPI_FUT_IRECV_COMPLETE_BEGIN		0x5210
+#define _STARPU_MPI_FUT_IRECV_COMPLETE_END		0x5211
+#define _STARPU_MPI_FUT_SLEEP_BEGIN			0x5212
+#define _STARPU_MPI_FUT_SLEEP_END			0x5213
+#define _STARPU_MPI_FUT_DTESTING_BEGIN			0x5214
+#define _STARPU_MPI_FUT_DTESTING_END			0x5215
+#define _STARPU_MPI_FUT_UTESTING_BEGIN			0x5216
+#define _STARPU_MPI_FUT_UTESTING_END			0x5217
+#define _STARPU_MPI_FUT_UWAIT_BEGIN			0x5218
+#define _STARPU_MPI_FUT_UWAIT_END			0x5219
 
 
 #ifdef STARPU_USE_FXT
 #ifdef STARPU_USE_FXT
-#define TRACE_MPI_START(rank, worldsize)	\
-	FUT_DO_PROBE3(FUT_MPI_START, (rank), (worldsize), _starpu_gettid());
-#define TRACE_MPI_STOP(rank, worldsize)	\
-	FUT_DO_PROBE3(FUT_MPI_STOP, (rank), (worldsize), _starpu_gettid());
-#define TRACE_MPI_BARRIER(rank, worldsize, key)	\
-	FUT_DO_PROBE4(FUT_MPI_BARRIER, (rank), (worldsize), (key), _starpu_gettid());
-#define TRACE_MPI_ISEND_SUBMIT_BEGIN(dest, mpi_tag, size)	\
-	FUT_DO_PROBE4(FUT_MPI_ISEND_SUBMIT_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
-#define TRACE_MPI_ISEND_SUBMIT_END(dest, mpi_tag, size)	\
-	FUT_DO_PROBE4(FUT_MPI_ISEND_SUBMIT_END, (dest), (mpi_tag), (size), _starpu_gettid());
-#define TRACE_MPI_IRECV_SUBMIT_BEGIN(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_IRECV_SUBMIT_BEGIN, (src), (mpi_tag), _starpu_gettid());
-#define TRACE_MPI_IRECV_SUBMIT_END(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_IRECV_SUBMIT_END, (src), (mpi_tag), _starpu_gettid());
-#define TRACE_MPI_ISEND_COMPLETE_BEGIN(dest, mpi_tag, size)	\
-	FUT_DO_PROBE4(FUT_MPI_ISEND_COMPLETE_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
-#define TRACE_MPI_ISEND_COMPLETE_END(dest, mpi_tag, size)	\
-	FUT_DO_PROBE4(FUT_MPI_ISEND_COMPLETE_END, (dest), (mpi_tag), (size), _starpu_gettid());
-#define TRACE_MPI_IRECV_COMPLETE_BEGIN(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_IRECV_COMPLETE_BEGIN, (src), (mpi_tag), _starpu_gettid());
-#define TRACE_MPI_IRECV_COMPLETE_END(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_IRECV_COMPLETE_END, (src), (mpi_tag), _starpu_gettid());
-#define TRACE_MPI_SLEEP_BEGIN()	\
-	FUT_DO_PROBE1(FUT_MPI_SLEEP_BEGIN, _starpu_gettid());
-#define TRACE_MPI_SLEEP_END()	\
-	FUT_DO_PROBE1(FUT_MPI_SLEEP_END, _starpu_gettid());
-#define TRACE_MPI_DTESTING_BEGIN()	\
-	FUT_DO_PROBE1(FUT_MPI_DTESTING_BEGIN,  _starpu_gettid());
-#define TRACE_MPI_DTESTING_END()	\
-	FUT_DO_PROBE1(FUT_MPI_DTESTING_END, _starpu_gettid());
-#define TRACE_MPI_UTESTING_BEGIN(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_UTESTING_BEGIN, (src), (mpi_tag),  _starpu_gettid());
-#define TRACE_MPI_UTESTING_END(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_UTESTING_END, (src), (mpi_tag), _starpu_gettid());
-#define TRACE_MPI_UWAIT_BEGIN(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_UWAIT_BEGIN, (src), (mpi_tag),  _starpu_gettid());
-#define TRACE_MPI_UWAIT_END(src, mpi_tag)	\
-	FUT_DO_PROBE3(FUT_MPI_UWAIT_END, (src), (mpi_tag), _starpu_gettid());
+#define _STARPU_MPI_TRACE_START(rank, worldsize)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_START, (rank), (worldsize), _starpu_gettid());
+#define _STARPU_MPI_TRACE_STOP(rank, worldsize)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_STOP, (rank), (worldsize), _starpu_gettid());
+#define _STARPU_MPI_TRACE_BARRIER(rank, worldsize, key)	\
+	FUT_DO_PROBE4(_STARPU_MPI_FUT_BARRIER, (rank), (worldsize), (key), _starpu_gettid());
+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(dest, mpi_tag, size)	\
+	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(dest, mpi_tag, size)	\
+	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_SUBMIT_END, (dest), (mpi_tag), (size), _starpu_gettid());
+#define _STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(src, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN, (src), (mpi_tag), _starpu_gettid());
+#define _STARPU_MPI_TRACE_IRECV_SUBMIT_END(src, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_SUBMIT_END, (src), (mpi_tag), _starpu_gettid());
+#define _STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(dest, mpi_tag, size)	\
+	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_COMPLETE_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
+#define _STARPU_MPI_TRACE_ISEND_COMPLETE_END(dest, mpi_tag, size)	\
+	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_COMPLETE_END, (dest), (mpi_tag), (size), _starpu_gettid());
+#define _STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(src, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_COMPLETE_BEGIN, (src), (mpi_tag), _starpu_gettid());
+#define _STARPU_MPI_TRACE_IRECV_COMPLETE_END(src, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_COMPLETE_END, (src), (mpi_tag), _starpu_gettid());
+#define _STARPU_MPI_TRACE_SLEEP_BEGIN()	\
+	FUT_DO_PROBE1(_STARPU_MPI_FUT_SLEEP_BEGIN, _starpu_gettid());
+#define _STARPU_MPI_TRACE_SLEEP_END()	\
+	FUT_DO_PROBE1(_STARPU_MPI_FUT_SLEEP_END, _starpu_gettid());
+#define _STARPU_MPI_TRACE_DTESTING_BEGIN()	\
+	FUT_DO_PROBE1(_STARPU_MPI_FUT_DTESTING_BEGIN,  _starpu_gettid());
+#define _STARPU_MPI_TRACE_DTESTING_END()	\
+	FUT_DO_PROBE1(_STARPU_MPI_FUT_DTESTING_END, _starpu_gettid());
+#define _STARPU_MPI_TRACE_UTESTING_BEGIN(src, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_UTESTING_BEGIN, (src), (mpi_tag),  _starpu_gettid());
+#define _STARPU_MPI_TRACE_UTESTING_END(src, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_UTESTING_END, (src), (mpi_tag), _starpu_gettid());
+#define _STARPU_MPI_TRACE_UWAIT_BEGIN(src, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_UWAIT_BEGIN, (src), (mpi_tag),  _starpu_gettid());
+#define _STARPU_MPI_TRACE_UWAIT_END(src, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_UWAIT_END, (src), (mpi_tag), _starpu_gettid());
 #define TRACE
 #define TRACE
 #else
 #else
-#define TRACE_MPI_START(a, b)				do {} while(0);
-#define TRACE_MPI_STOP(a, b)				do {} while(0);
-#define TRACE_MPI_BARRIER(a, b, c)			do {} while(0);
-#define TRACE_MPI_ISEND_SUBMIT_BEGIN(a, b, c)		do {} while(0);
-#define TRACE_MPI_ISEND_SUBMIT_END(a, b, c)		do {} while(0);
-#define TRACE_MPI_IRECV_SUBMIT_BEGIN(a, b)		do {} while(0);
-#define TRACE_MPI_IRECV_SUBMIT_END(a, b)		do {} while(0);
-#define TRACE_MPI_ISEND_COMPLETE_BEGIN(a, b, c)		do {} while(0);
-#define TRACE_MPI_ISEND_COMPLETE_END(a, b, c)		do {} while(0);
-#define TRACE_MPI_IRECV_COMPLETE_BEGIN(a, b)		do {} while(0);
-#define TRACE_MPI_IRECV_COMPLETE_END(a, b)		do {} while(0);
-#define TRACE_MPI_SLEEP_BEGIN()				do {} while(0);
-#define TRACE_MPI_SLEEP_END()				do {} while(0);
-#define TRACE_MPI_DTESTING_BEGIN()			do {} while(0);
-#define TRACE_MPI_DTESTING_END()			do {} while(0);
-#define TRACE_MPI_UTESTING_BEGIN(a, b)			do {} while(0);
-#define TRACE_MPI_UTESTING_END(a, b)			do {} while(0);
-#define TRACE_MPI_UWAIT_BEGIN(a, b)			do {} while(0);
-#define TRACE_MPI_UWAIT_END(a, b)			do {} while(0);
+#define _STARPU_MPI_TRACE_START(a, b)				do {} while(0);
+#define _STARPU_MPI_TRACE_STOP(a, b)				do {} while(0);
+#define _STARPU_MPI_TRACE_BARRIER(a, b, c)			do {} while(0);
+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(a, b, c)		do {} while(0);
+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(a, b, c)		do {} while(0);
+#define _STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(a, b)		do {} while(0);
+#define _STARPU_MPI_TRACE_IRECV_SUBMIT_END(a, b)		do {} while(0);
+#define _STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(a, b, c)		do {} while(0);
+#define _STARPU_MPI_TRACE_ISEND_COMPLETE_END(a, b, c)		do {} while(0);
+#define _STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(a, b)		do {} while(0);
+#define _STARPU_MPI_TRACE_IRECV_COMPLETE_END(a, b)		do {} while(0);
+#define _STARPU_MPI_TRACE_SLEEP_BEGIN()				do {} while(0);
+#define _STARPU_MPI_TRACE_SLEEP_END()				do {} while(0);
+#define _STARPU_MPI_TRACE_DTESTING_BEGIN()			do {} while(0);
+#define _STARPU_MPI_TRACE_DTESTING_END()			do {} while(0);
+#define _STARPU_MPI_TRACE_UTESTING_BEGIN(a, b)			do {} while(0);
+#define _STARPU_MPI_TRACE_UTESTING_END(a, b)			do {} while(0);
+#define _STARPU_MPI_TRACE_UWAIT_BEGIN(a, b)			do {} while(0);
+#define _STARPU_MPI_TRACE_UWAIT_END(a, b)			do {} while(0);
 #endif
 #endif
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus

+ 6 - 5
src/common/starpu_spinlock.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2013, 2014  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -24,15 +24,17 @@
 int _starpu_spin_init(struct _starpu_spinlock *lock)
 int _starpu_spin_init(struct _starpu_spinlock *lock)
 {
 {
 #if defined(STARPU_SPINLOCK_CHECK)
 #if defined(STARPU_SPINLOCK_CHECK)
+	starpu_pthread_mutexattr_t errcheck_attr;
 //	memcpy(&lock->errcheck_lock, PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP, sizeof(PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP));
 //	memcpy(&lock->errcheck_lock, PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP, sizeof(PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP));
 	int ret;
 	int ret;
-	ret = starpu_pthread_mutexattr_init(&lock->errcheck_attr);
+	ret = starpu_pthread_mutexattr_init(&errcheck_attr);
 	STARPU_CHECK_RETURN_VALUE(ret, "pthread_mutexattr_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "pthread_mutexattr_init");
 
 
-	ret = starpu_pthread_mutexattr_settype(&lock->errcheck_attr, PTHREAD_MUTEX_ERRORCHECK);
+	ret = starpu_pthread_mutexattr_settype(&errcheck_attr, PTHREAD_MUTEX_ERRORCHECK);
 	STARPU_ASSERT(!ret);
 	STARPU_ASSERT(!ret);
 
 
-	ret = starpu_pthread_mutex_init(&lock->errcheck_lock, &lock->errcheck_attr);
+	ret = starpu_pthread_mutex_init(&lock->errcheck_lock, &errcheck_attr);
+	starpu_pthread_mutexattr_destroy(&errcheck_attr);
 	return ret;
 	return ret;
 #else
 #else
 	int ret = starpu_pthread_spin_init(&lock->lock, 0);
 	int ret = starpu_pthread_spin_init(&lock->lock, 0);
@@ -44,7 +46,6 @@ int _starpu_spin_init(struct _starpu_spinlock *lock)
 int _starpu_spin_destroy(struct _starpu_spinlock *lock STARPU_ATTRIBUTE_UNUSED)
 int _starpu_spin_destroy(struct _starpu_spinlock *lock STARPU_ATTRIBUTE_UNUSED)
 {
 {
 #if defined(STARPU_SPINLOCK_CHECK)
 #if defined(STARPU_SPINLOCK_CHECK)
-	starpu_pthread_mutexattr_destroy(&lock->errcheck_attr);
 	return starpu_pthread_mutex_destroy(&lock->errcheck_lock);
 	return starpu_pthread_mutex_destroy(&lock->errcheck_lock);
 #else
 #else
 	return starpu_pthread_spin_destroy(&lock->lock);
 	return starpu_pthread_spin_destroy(&lock->lock);

+ 1 - 2
src/common/starpu_spinlock.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2013, 2014  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -25,7 +25,6 @@
 struct _starpu_spinlock
 struct _starpu_spinlock
 {
 {
 #if defined(STARPU_SPINLOCK_CHECK)
 #if defined(STARPU_SPINLOCK_CHECK)
-	starpu_pthread_mutexattr_t errcheck_attr;
 	starpu_pthread_mutex_t errcheck_lock;
 	starpu_pthread_mutex_t errcheck_lock;
 	const char *last_taker;
 	const char *last_taker;
 #else
 #else

+ 21 - 19
src/common/utils.c

@@ -83,10 +83,24 @@ int _starpu_mkpath(const char *s, mode_t mode)
 	if ((_starpu_mkpath(up, mode) == -1) && (errno != EEXIST))
 	if ((_starpu_mkpath(up, mode) == -1) && (errno != EEXIST))
 		goto out;
 		goto out;
 
 
-	if ((mkdir(path, mode) == -1) && (errno != EEXIST))
-		rv = -1;
-	else
+	struct stat sb;
+	if (stat(path, &sb) == 0)
+	{
+		if (!S_ISDIR(sb.st_mode))
+		{
+			fprintf(stderr,"Error: %s is not a directory:\n", path);
+			STARPU_ABORT();
+		}
+		/* It already exists and is a directory.  */
 		rv = 0;
 		rv = 0;
+	}
+	else
+	{
+		if ((mkdir(path, mode) == -1) && (errno != EEXIST))
+			rv = -1;
+		else
+			rv = 0;
+	}
 
 
 out:
 out:
 	olderrno = errno;
 	olderrno = errno;
@@ -105,23 +119,11 @@ void _starpu_mkpath_and_check(const char *path, mode_t mode)
 
 
 	ret = _starpu_mkpath(path, mode);
 	ret = _starpu_mkpath(path, mode);
 
 
-	if (ret == -1)
+	if (ret == -1 && errno != EEXIST)
 	{
 	{
-		if (errno != EEXIST)
-		{
-			fprintf(stderr,"Error making StarPU directory %s:\n", path);
-			perror("mkdir");
-			STARPU_ABORT();
-		}
-
-		/* make sure that it is actually a directory */
-		struct stat sb;
-		stat(path, &sb);
-		if (!S_ISDIR(sb.st_mode))
-		{
-			fprintf(stderr,"Error: %s is not a directory:\n", path);
-			STARPU_ABORT();
-		}
+		fprintf(stderr,"Error making StarPU directory %s:\n", path);
+		perror("mkdir");
+		STARPU_ABORT();
 	}
 	}
 }
 }
 
 

+ 2 - 2
src/core/perfmodel/perfmodel_history.c

@@ -1230,7 +1230,7 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 			char archname[32];
 			char archname[32];
 
 
 			starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), nimpl);
 			starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), nimpl);
-			_STARPU_DISP("Warning: model %s is not calibrated enough for %s, forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname);
+			_STARPU_DISP("Warning: model %s is not calibrated enough for %s (only %u measurements), forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname, entry && entry->history_entry ? entry->history_entry->nsample : 0);
 			_starpu_set_calibrate_flag(1);
 			_starpu_set_calibrate_flag(1);
 			model->benchmarking = 1;
 			model->benchmarking = 1;
 		}
 		}
@@ -1272,7 +1272,7 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, s
 		char archname[32];
 		char archname[32];
 
 
 		starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), nimpl);
 		starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), nimpl);
-		_STARPU_DISP("Warning: model %s is not calibrated enough for %s, forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname);
+		_STARPU_DISP("Warning: model %s is not calibrated enough for %s (only %u measurements), forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname, entry ? entry->nsample : 0);
 		_starpu_set_calibrate_flag(1);
 		_starpu_set_calibrate_flag(1);
 		model->benchmarking = 1;
 		model->benchmarking = 1;
 	}
 	}

+ 2 - 0
src/core/sched_policy.c

@@ -841,6 +841,7 @@ pick:
 	 * We do have a task that uses multiformat handles. Let's create the
 	 * We do have a task that uses multiformat handles. Let's create the
 	 * required conversion tasks.
 	 * required conversion tasks.
 	 */
 	 */
+	STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
 	unsigned i;
 	unsigned i;
 	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 	for (i = 0; i < nbuffers; i++)
 	for (i = 0; i < nbuffers; i++)
@@ -864,6 +865,7 @@ pick:
 
 
 	task->mf_skip = 1;
 	task->mf_skip = 1;
 	starpu_task_list_push_back(&worker->local_tasks, task);
 	starpu_task_list_push_back(&worker->local_tasks, task);
+	STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
 	goto pick;
 	goto pick;
 
 
 profiling:
 profiling:

+ 23 - 2
src/core/simgrid.c

@@ -27,6 +27,7 @@
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 #include <msg/msg.h>
 #include <msg/msg.h>
 #include <smpi/smpif.h>
 #include <smpi/smpif.h>
+#include <sys/resource.h>
 
 
 #define STARPU_MPI_AS_PREFIX "StarPU-MPI"
 #define STARPU_MPI_AS_PREFIX "StarPU-MPI"
 
 
@@ -160,7 +161,7 @@ int main(int argc, char **argv)
 
 
 	if (!starpu_main && !(smpi_main && smpi_simulated_main_))
 	if (!starpu_main && !(smpi_main && smpi_simulated_main_))
 	{
 	{
-		_STARPU_ERROR("The main file of this application needs to be compiled with starpu.h included, to properly define starpu_main\n");
+		_STARPU_ERROR("In simgrid mode, the file containing the main() function of this application needs to be compiled with starpu.h included, to properly rename it into starpu_main\n");
 		exit(EXIT_FAILURE);
 		exit(EXIT_FAILURE);
 	}
 	}
 
 
@@ -178,7 +179,12 @@ int main(int argc, char **argv)
 #endif
 #endif
 	/* Simgrid uses tiny stacks by default.  This comes unexpected to our users.  */
 	/* Simgrid uses tiny stacks by default.  This comes unexpected to our users.  */
 	extern xbt_cfg_t _sg_cfg_set;
 	extern xbt_cfg_t _sg_cfg_set;
-	xbt_cfg_set_int(_sg_cfg_set, "contexts/stack_size", 8192);
+	unsigned stack_size = 8192;
+	struct rlimit rlim;
+	if (getrlimit(RLIMIT_STACK, &rlim) == 0 && rlim.rlim_cur != 0 && rlim.rlim_cur != RLIM_INFINITY)
+		stack_size = rlim.rlim_cur / 1024;
+
+	xbt_cfg_set_int(_sg_cfg_set, "contexts/stack_size", stack_size);
 
 
 	/* Load XML platform */
 	/* Load XML platform */
 	_starpu_simgrid_get_platform_path(path, sizeof(path));
 	_starpu_simgrid_get_platform_path(path, sizeof(path));
@@ -196,6 +202,12 @@ void _starpu_simgrid_init()
 	xbt_dynar_t hosts;
 	xbt_dynar_t hosts;
 	int i;
 	int i;
 
 
+	if (!starpu_main && !(smpi_main && smpi_simulated_main_))
+	{
+		_STARPU_ERROR("In simgrid mode, the file containing the main() function of this application needs to be compiled with starpu.h included, to properly rename it into starpu_main\n");
+		exit(EXIT_FAILURE);
+	}
+
 #ifdef HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT
 #ifdef HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT
 	if (_starpu_simgrid_running_smpi())
 	if (_starpu_simgrid_running_smpi())
 	{
 	{
@@ -253,6 +265,10 @@ void _starpu_simgrid_init()
 	xbt_dynar_free(&hosts);
 	xbt_dynar_free(&hosts);
 }
 }
 
 
+/*
+ * Tasks
+ */
+
 /* Task execution submitted by StarPU */
 /* Task execution submitted by StarPU */
 void _starpu_simgrid_execute_job(struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length)
 void _starpu_simgrid_execute_job(struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length)
 {
 {
@@ -276,8 +292,13 @@ void _starpu_simgrid_execute_job(struct _starpu_job *j, struct starpu_perfmodel_
 			length/1000000.0*MSG_get_host_speed(MSG_host_self()),
 			length/1000000.0*MSG_get_host_speed(MSG_host_self()),
 			0, NULL);
 			0, NULL);
 	MSG_task_execute(simgrid_task);
 	MSG_task_execute(simgrid_task);
+	MSG_task_destroy(simgrid_task);
 }
 }
 
 
+/*
+ * Transfers
+ */
+
 /* Note: simgrid is not parallel, so there is no need to hold locks for management of transfers.  */
 /* Note: simgrid is not parallel, so there is no need to hold locks for management of transfers.  */
 LIST_TYPE(transfer,
 LIST_TYPE(transfer,
 	msg_task_t task;
 	msg_task_t task;

+ 11 - 3
src/core/workers.c

@@ -1218,16 +1218,17 @@ out:
 			STARPU_ASSERT(worker->local_ordered_tasks[n] == NULL);
 			STARPU_ASSERT(worker->local_ordered_tasks[n] == NULL);
 		_starpu_sched_ctx_list_delete(&worker->sched_ctx_list);
 		_starpu_sched_ctx_list_delete(&worker->sched_ctx_list);
 		_starpu_job_list_delete(worker->terminated_jobs);
 		_starpu_job_list_delete(worker->terminated_jobs);
+		free(worker->local_ordered_tasks);
 	}
 	}
 }
 }
 
 
 /* Condition variable and mutex used to pause/resume. */
 /* Condition variable and mutex used to pause/resume. */
 static starpu_pthread_cond_t pause_cond = STARPU_PTHREAD_COND_INITIALIZER;
 static starpu_pthread_cond_t pause_cond = STARPU_PTHREAD_COND_INITIALIZER;
 static starpu_pthread_mutex_t pause_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
 static starpu_pthread_mutex_t pause_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
-unsigned _starpu_machine_is_running(void)
+
+void _starpu_may_pause(void)
 {
 {
-	unsigned ret;
-	/* running and pause_depth are just protected by a memory barrier */
+	/* pause_depth is just protected by a memory barrier */
 	STARPU_RMB();
 	STARPU_RMB();
 
 
 	if (STARPU_UNLIKELY(config.pause_depth > 0)) {
 	if (STARPU_UNLIKELY(config.pause_depth > 0)) {
@@ -1237,6 +1238,13 @@ unsigned _starpu_machine_is_running(void)
 		}
 		}
 		STARPU_PTHREAD_MUTEX_UNLOCK(&pause_mutex);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&pause_mutex);
 	}
 	}
+}
+
+unsigned _starpu_machine_is_running(void)
+{
+	unsigned ret;
+	/* running is just protected by a memory barrier */
+	STARPU_RMB();
 
 
 	ANNOTATE_HAPPENS_AFTER(&config.running);
 	ANNOTATE_HAPPENS_AFTER(&config.running);
 	ret = config.running;
 	ret = config.running;

+ 3 - 0
src/core/workers.h

@@ -354,6 +354,9 @@ char ***_starpu_get_argv();
 /* Fill conf with environment variables */
 /* Fill conf with environment variables */
 void _starpu_conf_check_environment(struct starpu_conf *conf);
 void _starpu_conf_check_environment(struct starpu_conf *conf);
 
 
+/* Called by the driver when it is ready to pause  */
+void _starpu_may_pause(void);
+
 /* Has starpu_shutdown already been called ? */
 /* Has starpu_shutdown already been called ? */
 unsigned _starpu_machine_is_running(void);
 unsigned _starpu_machine_is_running(void);
 
 

+ 42 - 23
src/datawizard/coherency.c

@@ -41,8 +41,6 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 	double cost = INFINITY;
 	double cost = INFINITY;
 	unsigned src_node_mask = 0;
 	unsigned src_node_mask = 0;
 
 
-	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
-
 	for (node = 0; node < nnodes; node++)
 	for (node = 0; node < nnodes; node++)
 	{
 	{
 		if (handle->per_node[node].state != STARPU_INVALID)
 		if (handle->per_node[node].state != STARPU_INVALID)
@@ -74,15 +72,6 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 				double time = starpu_transfer_predict(i, destination, size);
 				double time = starpu_transfer_predict(i, destination, size);
 				unsigned handling_node;
 				unsigned handling_node;
 
 
-				/* Avoid transfers which the interface does not want */
-				if (copy_methods->can_copy)
-				{
-					void *src_interface = handle->per_node[i].data_interface;
-					void *dst_interface = handle->per_node[destination].data_interface;
-					if (!copy_methods->can_copy(src_interface, i, dst_interface, destination))
-						continue;
-				}
-
 				/* Avoid indirect transfers */
 				/* Avoid indirect transfers */
 				if (!link_supports_direct_transfers(handle, i, destination, &handling_node))
 				if (!link_supports_direct_transfers(handle, i, destination, &handling_node))
 					continue;
 					continue;
@@ -115,22 +104,22 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 		
 		
 		if (src_node_mask & (1<<i))
 		if (src_node_mask & (1<<i))
 		{
 		{
+			int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node) = handle->ops->copy_methods->can_copy;
 			/* Avoid transfers which the interface does not want */
 			/* Avoid transfers which the interface does not want */
-			if (copy_methods->can_copy)
+			if (can_copy)
 			{
 			{
 				void *src_interface = handle->per_node[i].data_interface;
 				void *src_interface = handle->per_node[i].data_interface;
 				void *dst_interface = handle->per_node[destination].data_interface;
 				void *dst_interface = handle->per_node[destination].data_interface;
 				unsigned handling_node;
 				unsigned handling_node;
 
 
-				if (!copy_methods->can_copy(src_interface, i, dst_interface, destination))
-					continue;
-
 				if (!link_supports_direct_transfers(handle, i, destination, &handling_node))
 				if (!link_supports_direct_transfers(handle, i, destination, &handling_node))
 				{
 				{
 					/* Avoid through RAM if the interface does not want it */
 					/* Avoid through RAM if the interface does not want it */
 					void *ram_interface = handle->per_node[STARPU_MAIN_RAM].data_interface;
 					void *ram_interface = handle->per_node[STARPU_MAIN_RAM].data_interface;
-					if (!copy_methods->can_copy(src_interface, i, ram_interface, STARPU_MAIN_RAM)
-					 || !copy_methods->can_copy(ram_interface, STARPU_MAIN_RAM, dst_interface, destination))
+					if ((!can_copy(src_interface, i, ram_interface, STARPU_MAIN_RAM, i)
+					  && !can_copy(src_interface, i, ram_interface, STARPU_MAIN_RAM, STARPU_MAIN_RAM))
+					 || (!can_copy(ram_interface, STARPU_MAIN_RAM, dst_interface, destination, STARPU_MAIN_RAM)
+					  && !can_copy(ram_interface, STARPU_MAIN_RAM, dst_interface, destination, destination)))
 						continue;
 						continue;
 				}
 				}
 			}
 			}
@@ -251,7 +240,9 @@ static int worker_supports_direct_access(unsigned node, unsigned handling_node)
 
 
 static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned src_node, unsigned dst_node, unsigned *handling_node)
 static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned src_node, unsigned dst_node, unsigned *handling_node)
 {
 {
-	(void) handle; // unused
+	int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node) = handle->ops->copy_methods->can_copy;
+	void *src_interface = handle->per_node[src_node].data_interface;
+	void *dst_interface = handle->per_node[dst_node].data_interface;
 
 
 	/* XXX That's a hack until we fix cudaMemcpy3DPeerAsync in the block interface
 	/* XXX That's a hack until we fix cudaMemcpy3DPeerAsync in the block interface
 	 * Perhaps not all data interface provide a direct GPU-GPU transfer
 	 * Perhaps not all data interface provide a direct GPU-GPU transfer
@@ -260,19 +251,19 @@ static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned
 	if (src_node != dst_node && starpu_node_get_kind(src_node) == STARPU_CUDA_RAM && starpu_node_get_kind(dst_node) == STARPU_CUDA_RAM)
 	if (src_node != dst_node && starpu_node_get_kind(src_node) == STARPU_CUDA_RAM && starpu_node_get_kind(dst_node) == STARPU_CUDA_RAM)
 	{
 	{
 		const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
 		const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
-		if (!copy_methods->cuda_to_cuda_async)
+		if (!copy_methods->cuda_to_cuda_async && !copy_methods->any_to_any)
 			return 0;
 			return 0;
 	}
 	}
 #endif
 #endif
 
 
 	/* Note: with CUDA, performance seems a bit better when issuing the transfer from the destination (tested without GPUDirect, but GPUDirect probably behave the same) */
 	/* Note: with CUDA, performance seems a bit better when issuing the transfer from the destination (tested without GPUDirect, but GPUDirect probably behave the same) */
-	if (worker_supports_direct_access(src_node, dst_node))
+	if (worker_supports_direct_access(src_node, dst_node) && (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, dst_node)))
 	{
 	{
 		*handling_node = dst_node;
 		*handling_node = dst_node;
 		return 1;
 		return 1;
 	}
 	}
 
 
-	if (worker_supports_direct_access(dst_node, src_node))
+	if (worker_supports_direct_access(dst_node, src_node) && (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, src_node)))
 	{
 	{
 		*handling_node = src_node;
 		*handling_node = src_node;
 		return 1;
 		return 1;
@@ -319,6 +310,10 @@ static int determine_request_path(starpu_data_handle_t handle,
 
 
 	if (!link_is_valid)
 	if (!link_is_valid)
 	{
 	{
+		int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node) = handle->ops->copy_methods->can_copy;
+		void *src_interface = handle->per_node[src_node].data_interface;
+		void *dst_interface = handle->per_node[dst_node].data_interface;
+
 		/* We need an intermediate hop to implement data staging
 		/* We need an intermediate hop to implement data staging
 		 * through main memory. */
 		 * through main memory. */
 		STARPU_ASSERT(max_len >= 2);
 		STARPU_ASSERT(max_len >= 2);
@@ -326,12 +321,36 @@ static int determine_request_path(starpu_data_handle_t handle,
 		/* GPU -> RAM */
 		/* GPU -> RAM */
 		src_nodes[0] = src_node;
 		src_nodes[0] = src_node;
 		dst_nodes[0] = STARPU_MAIN_RAM;
 		dst_nodes[0] = STARPU_MAIN_RAM;
-		handling_nodes[0] = starpu_node_get_kind(src_node) == STARPU_DISK_RAM ? dst_node : src_node;
+
+		if (starpu_node_get_kind(src_node) == STARPU_DISK_RAM)
+			/* Disks don't have their own driver thread */
+			handling_nodes[0] = dst_node;
+		else if (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, src_node))
+		{
+			handling_nodes[0] = src_node;
+		}
+		else
+		{
+			STARPU_ASSERT_MSG(can_copy(src_interface, src_node, dst_interface, dst_node, dst_node), "interface %d refuses all kinds of transfers from node %u to node %u\n", handle->ops->interfaceid, src_node, dst_node);
+			handling_nodes[0] = dst_node;
+		}
 
 
 		/* RAM -> GPU */
 		/* RAM -> GPU */
 		src_nodes[1] = STARPU_MAIN_RAM;
 		src_nodes[1] = STARPU_MAIN_RAM;
 		dst_nodes[1] = dst_node;
 		dst_nodes[1] = dst_node;
-		handling_nodes[1] = starpu_node_get_kind(dst_node) == STARPU_DISK_RAM ? src_node : dst_node;
+
+		if (starpu_node_get_kind(dst_node) == STARPU_DISK_RAM)
+			/* Disks don't have their own driver thread */
+			handling_nodes[1] = src_node;
+		else if (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, dst_node))
+		{
+			handling_nodes[1] = dst_node;
+		}
+		else
+		{
+			STARPU_ASSERT_MSG(can_copy(src_interface, src_node, dst_interface, dst_node, src_node), "interface %d refuses all kinds of transfers from node %u to node %u\n", handle->ops->interfaceid, src_node, dst_node);
+			handling_nodes[1] = src_node;
+		}
 
 
 		return 2;
 		return 2;
 	}
 	}

+ 11 - 6
src/datawizard/data_request.c

@@ -155,17 +155,22 @@ int _starpu_wait_data_request_completion(struct _starpu_data_request *r, unsigne
 {
 {
 	int retval;
 	int retval;
 	int do_delete = 0;
 	int do_delete = 0;
+	int completed;
 
 
 	unsigned local_node = _starpu_memory_node_get_local_key();
 	unsigned local_node = _starpu_memory_node_get_local_key();
 
 
 	do
 	do
 	{
 	{
-		_starpu_spin_lock(&r->lock);
-
-		if (r->completed)
-			break;
-
-		_starpu_spin_unlock(&r->lock);
+		STARPU_HG_DISABLE_CHECKING(r->completed);
+		completed = r->completed;
+		STARPU_HG_ENABLE_CHECKING(r->completed);
+		if (completed)
+		{
+			_starpu_spin_lock(&r->lock);
+			if (r->completed)
+				break;
+			_starpu_spin_unlock(&r->lock);
+		}
 
 
 #ifndef STARPU_NON_BLOCKING_DRIVERS
 #ifndef STARPU_NON_BLOCKING_DRIVERS
 		_starpu_wake_all_blocked_workers_on_node(r->handling_node);
 		_starpu_wake_all_blocked_workers_on_node(r->handling_node);

+ 18 - 2
src/datawizard/malloc.c

@@ -50,7 +50,7 @@ struct malloc_pinned_codelet_struct
 //{
 //{
 //	struct malloc_pinned_codelet_struct *s = arg;
 //	struct malloc_pinned_codelet_struct *s = arg;
 //        //        *(s->ptr) = malloc(s->dim);
 //        //        *(s->ptr) = malloc(s->dim);
-//        starpu_opencl_allocate_memory((void **)(s->ptr), s->dim, CL_MEM_READ_WRITE|CL_MEM_ALLOC_HOST_PTR);
+//        starpu_opencl_allocate_memory(devid, (void **)(s->ptr), s->dim, CL_MEM_READ_WRITE|CL_MEM_ALLOC_HOST_PTR);
 //}
 //}
 //#endif
 //#endif
 
 
@@ -404,6 +404,14 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size)
 			STARPU_ASSERT(last[dst_node] >= addr);
 			STARPU_ASSERT(last[dst_node] >= addr);
 			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
 			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
 #else
 #else
+			struct _starpu_worker *worker = _starpu_get_local_worker_key();
+			unsigned devid = _starpu_memory_node_get_devid(dst_node);
+			if (!worker || worker->arch != STARPU_CUDA_WORKER || worker->devid != devid)
+#if defined(HAVE_CUDA_MEMCPY_PEER)
+				starpu_cuda_set_device(devid);
+#else
+				STARPU_ASSERT_MSG(0, "CUDA peer access is not available with this version of CUDA");
+#endif
 			status = cudaMalloc((void **)&addr, size);
 			status = cudaMalloc((void **)&addr, size);
 			if (!addr || (status != cudaSuccess))
 			if (!addr || (status != cudaSuccess))
 			{
 			{
@@ -433,7 +441,7 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size)
                                 int ret;
                                 int ret;
 				cl_mem ptr;
 				cl_mem ptr;
 
 
-				ret = starpu_opencl_allocate_memory(&ptr, size, CL_MEM_READ_WRITE);
+				ret = starpu_opencl_allocate_memory(_starpu_memory_node_get_devid(dst_node), &ptr, size, CL_MEM_READ_WRITE);
 				if (ret)
 				if (ret)
 				{
 				{
 					addr = 0;
 					addr = 0;
@@ -505,6 +513,14 @@ _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
 			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
 #else
 #else
 			cudaError_t err;
 			cudaError_t err;
+			struct _starpu_worker *worker = _starpu_get_local_worker_key();
+			unsigned devid = _starpu_memory_node_get_devid(dst_node);
+			if (!worker || worker->arch != STARPU_CUDA_WORKER || worker->devid != devid)
+#if defined(HAVE_CUDA_MEMCPY_PEER)
+				starpu_cuda_set_device(devid);
+#else
+				STARPU_ASSERT_MSG(0, "CUDA peer access is not available with this version of CUDA");
+#endif
 			err = cudaFree((void*)addr);
 			err = cudaFree((void*)addr);
 			if (STARPU_UNLIKELY(err != cudaSuccess))
 			if (STARPU_UNLIKELY(err != cudaSuccess))
 				STARPU_CUDA_REPORT_ERROR(err);
 				STARPU_CUDA_REPORT_ERROR(err);

+ 10 - 5
src/datawizard/memalloc.c

@@ -440,19 +440,24 @@ static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_re
 
 
 	if (!old_replicate)
 	if (!old_replicate)
 	{
 	{
+		/* Free the copy that we made */
 		free(mc->chunk_interface);
 		free(mc->chunk_interface);
 		mc->chunk_interface = NULL;
 		mc->chunk_interface = NULL;
 	}
 	}
 
 
-	mc->data = new_replicate->handle;
-	/* mc->ops, mc->footprint and mc->interface should be
+	/* XXX: We do not actually reuse the mc at the moment, only the interface */
+
+	/* mc->data = new_replicate->handle; */
+	/* mc->footprint, mc->ops, mc->size_interface, mc->automatically_allocated should be
  	 * unchanged ! */
  	 * unchanged ! */
 
 
-	/* reinsert the mem chunk in the list of active memory chunks */
-	if (!is_already_in_mc_list)
+	/* remove the mem chunk from the list of active memory chunks, register_mem_chunk will put it back later */
+	if (is_already_in_mc_list)
 	{
 	{
-		_starpu_mem_chunk_list_push_back(mc_list[node], mc);
+		_starpu_mem_chunk_list_erase(mc_list[node], mc);
 	}
 	}
+
+	free(mc);
 }
 }
 
 
 static unsigned try_to_reuse_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node, struct _starpu_data_replicate *replicate, unsigned is_already_in_mc_list)
 static unsigned try_to_reuse_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node, struct _starpu_data_replicate *replicate, unsigned is_already_in_mc_list)

+ 19 - 19
src/debug/traces/starpu_fxt.c

@@ -1893,79 +1893,79 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 				handle_user_event(&ev, options);
 				handle_user_event(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_START:
+			case _STARPU_MPI_FUT_START:
 				handle_mpi_start(&ev, options);
 				handle_mpi_start(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_STOP:
+			case _STARPU_MPI_FUT_STOP:
 				handle_mpi_stop(&ev, options);
 				handle_mpi_stop(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_BARRIER:
+			case _STARPU_MPI_FUT_BARRIER:
 				handle_mpi_barrier(&ev, options);
 				handle_mpi_barrier(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_ISEND_SUBMIT_BEGIN:
+			case _STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN:
 				handle_mpi_isend_submit_begin(&ev, options);
 				handle_mpi_isend_submit_begin(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_ISEND_SUBMIT_END:
+			case _STARPU_MPI_FUT_ISEND_SUBMIT_END:
 				handle_mpi_isend_submit_end(&ev, options);
 				handle_mpi_isend_submit_end(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_IRECV_SUBMIT_BEGIN:
+			case _STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN:
 				handle_mpi_irecv_submit_begin(&ev, options);
 				handle_mpi_irecv_submit_begin(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_IRECV_SUBMIT_END:
+			case _STARPU_MPI_FUT_IRECV_SUBMIT_END:
 				handle_mpi_irecv_submit_end(&ev, options);
 				handle_mpi_irecv_submit_end(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_ISEND_COMPLETE_BEGIN:
+			case _STARPU_MPI_FUT_ISEND_COMPLETE_BEGIN:
 				handle_mpi_isend_complete_begin(&ev, options);
 				handle_mpi_isend_complete_begin(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_ISEND_COMPLETE_END:
+			case _STARPU_MPI_FUT_ISEND_COMPLETE_END:
 				handle_mpi_isend_complete_end(&ev, options);
 				handle_mpi_isend_complete_end(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_IRECV_COMPLETE_BEGIN:
+			case _STARPU_MPI_FUT_IRECV_COMPLETE_BEGIN:
 				handle_mpi_irecv_complete_begin(&ev, options);
 				handle_mpi_irecv_complete_begin(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_IRECV_COMPLETE_END:
+			case _STARPU_MPI_FUT_IRECV_COMPLETE_END:
 				handle_mpi_irecv_complete_end(&ev, options);
 				handle_mpi_irecv_complete_end(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_SLEEP_BEGIN:
+			case _STARPU_MPI_FUT_SLEEP_BEGIN:
 				handle_mpi_sleep_begin(&ev, options);
 				handle_mpi_sleep_begin(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_SLEEP_END:
+			case _STARPU_MPI_FUT_SLEEP_END:
 				handle_mpi_sleep_end(&ev, options);
 				handle_mpi_sleep_end(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_DTESTING_BEGIN:
+			case _STARPU_MPI_FUT_DTESTING_BEGIN:
 				handle_mpi_dtesting_begin(&ev, options);
 				handle_mpi_dtesting_begin(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_DTESTING_END:
+			case _STARPU_MPI_FUT_DTESTING_END:
 				handle_mpi_dtesting_end(&ev, options);
 				handle_mpi_dtesting_end(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_UTESTING_BEGIN:
+			case _STARPU_MPI_FUT_UTESTING_BEGIN:
 				handle_mpi_utesting_begin(&ev, options);
 				handle_mpi_utesting_begin(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_UTESTING_END:
+			case _STARPU_MPI_FUT_UTESTING_END:
 				handle_mpi_utesting_end(&ev, options);
 				handle_mpi_utesting_end(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_UWAIT_BEGIN:
+			case _STARPU_MPI_FUT_UWAIT_BEGIN:
 				handle_mpi_uwait_begin(&ev, options);
 				handle_mpi_uwait_begin(&ev, options);
 				break;
 				break;
 
 
-			case FUT_MPI_UWAIT_END:
+			case _STARPU_MPI_FUT_UWAIT_END:
 				handle_mpi_uwait_end(&ev, options);
 				handle_mpi_uwait_end(&ev, options);
 				break;
 				break;
 
 

+ 1 - 1
src/debug/traces/starpu_fxt_mpi.c

@@ -74,7 +74,7 @@ int _starpu_fxt_mpi_find_sync_point(char *filename_in, uint64_t *offset, int *ke
 			break;
 			break;
 		}
 		}
 
 
-		if (ev.code == FUT_MPI_BARRIER)
+		if (ev.code == _STARPU_MPI_FUT_BARRIER)
 		{
 		{
 			/* We found the sync point */
 			/* We found the sync point */
 			*offset = ev.time;
 			*offset = ev.time;

+ 3 - 0
src/drivers/cpu/driver_cpu.c

@@ -301,7 +301,10 @@ _starpu_cpu_worker(void *arg)
 
 
 	_starpu_cpu_driver_init(args);
 	_starpu_cpu_driver_init(args);
 	while (_starpu_machine_is_running())
 	while (_starpu_machine_is_running())
+	{
+		_starpu_may_pause();
 		_starpu_cpu_driver_run_once(args);
 		_starpu_cpu_driver_run_once(args);
+	}
 	_starpu_cpu_driver_deinit(args);
 	_starpu_cpu_driver_deinit(args);
 
 
 	return NULL;
 	return NULL;

+ 8 - 0
src/drivers/cuda/driver_cuda.c

@@ -567,6 +567,11 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 		_STARPU_DEBUG("cuda (%s) dev id %u worker %u thread is ready to run on CPU %d !\n", devname, devid, i, worker->bindid);
 		_STARPU_DEBUG("cuda (%s) dev id %u worker %u thread is ready to run on CPU %d !\n", devname, devid, i, worker->bindid);
 
 
 		worker->pipeline_length = starpu_get_env_number_default("STARPU_CUDA_PIPELINE", 2);
 		worker->pipeline_length = starpu_get_env_number_default("STARPU_CUDA_PIPELINE", 2);
+		if (worker->pipeline_length > STARPU_MAX_PIPELINE)
+		{
+			_STARPU_DISP("Warning: STARPU_CUDA_PIPELINE is %u, but STARPU_MAX_PIPELINE is only %u", worker->pipeline_length, STARPU_MAX_PIPELINE);
+			worker->pipeline_length = STARPU_MAX_PIPELINE;
+		}
 		_STARPU_TRACE_WORKER_INIT_END(worker_set->workers[i].workerid);
 		_STARPU_TRACE_WORKER_INIT_END(worker_set->workers[i].workerid);
 	}
 	}
 
 
@@ -752,7 +757,10 @@ void *_starpu_cuda_worker(void *_arg)
 	_starpu_cuda_driver_init(worker);
 	_starpu_cuda_driver_init(worker);
 	_STARPU_TRACE_START_PROGRESS(memnode);
 	_STARPU_TRACE_START_PROGRESS(memnode);
 	while (_starpu_machine_is_running())
 	while (_starpu_machine_is_running())
+	{
+		_starpu_may_pause();
 		_starpu_cuda_driver_run_once(worker);
 		_starpu_cuda_driver_run_once(worker);
+	}
 	_STARPU_TRACE_END_PROGRESS(memnode);
 	_STARPU_TRACE_END_PROGRESS(memnode);
 	_starpu_cuda_driver_deinit(worker);
 	_starpu_cuda_driver_deinit(worker);
 
 

+ 1 - 0
src/drivers/gordon/driver_gordon.c

@@ -343,6 +343,7 @@ void *gordon_worker_inject(struct _starpu_worker_set *arg)
 
 
 	while(_starpu_machine_is_running())
 	while(_starpu_machine_is_running())
 	{
 	{
+		_starpu_may_pause();
 		if (gordon_busy_enough())
 		if (gordon_busy_enough())
 		{
 		{
 			/* gordon already has enough work, wait a little TODO */
 			/* gordon already has enough work, wait a little TODO */

+ 2 - 0
src/drivers/mp_common/source_common.c

@@ -683,6 +683,8 @@ void _starpu_src_common_worker(struct _starpu_worker_set * worker_set,
 		int res;
 		int res;
 		struct _starpu_job * j;
 		struct _starpu_job * j;
 
 
+		_starpu_may_pause();
+
 		_STARPU_TRACE_START_PROGRESS(memnode);
 		_STARPU_TRACE_START_PROGRESS(memnode);
 		_starpu_datawizard_progress(memnode, 1);
 		_starpu_datawizard_progress(memnode, 1);
 		_STARPU_TRACE_END_PROGRESS(memnode);
 		_STARPU_TRACE_END_PROGRESS(memnode);

+ 11 - 4
src/drivers/opencl/driver_opencl.c

@@ -214,16 +214,15 @@ cl_int _starpu_opencl_deinit_context(int devid)
 }
 }
 #endif
 #endif
 
 
-cl_int starpu_opencl_allocate_memory(cl_mem *mem STARPU_ATTRIBUTE_UNUSED, size_t size STARPU_ATTRIBUTE_UNUSED, cl_mem_flags flags STARPU_ATTRIBUTE_UNUSED)
+cl_int starpu_opencl_allocate_memory(int devid, cl_mem *mem STARPU_ATTRIBUTE_UNUSED, size_t size STARPU_ATTRIBUTE_UNUSED, cl_mem_flags flags STARPU_ATTRIBUTE_UNUSED)
 {
 {
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 	STARPU_ABORT();
 	STARPU_ABORT();
 #else
 #else
 	cl_int err;
 	cl_int err;
         cl_mem memory;
         cl_mem memory;
-        struct _starpu_worker *worker = _starpu_get_local_worker_key();
 
 
-	memory = clCreateBuffer(contexts[worker->devid], flags, size, NULL, &err);
+	memory = clCreateBuffer(contexts[devid], flags, size, NULL, &err);
 	if (err == CL_OUT_OF_HOST_MEMORY) return err;
 	if (err == CL_OUT_OF_HOST_MEMORY) return err;
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
 
@@ -234,7 +233,7 @@ cl_int starpu_opencl_allocate_memory(cl_mem *mem STARPU_ATTRIBUTE_UNUSED, size_t
 	 */
 	 */
 	char dummy = 0;
 	char dummy = 0;
 	cl_event ev;
 	cl_event ev;
-	err = clEnqueueWriteBuffer(alloc_queues[worker->devid], memory, CL_TRUE,
+	err = clEnqueueWriteBuffer(alloc_queues[devid], memory, CL_TRUE,
 				   0, sizeof(dummy), &dummy,
 				   0, sizeof(dummy), &dummy,
 				   0, NULL, &ev);
 				   0, NULL, &ev);
 	if (err == CL_MEM_OBJECT_ALLOCATION_FAILURE)
 	if (err == CL_MEM_OBJECT_ALLOCATION_FAILURE)
@@ -597,6 +596,11 @@ int _starpu_opencl_driver_init(struct _starpu_worker *worker)
 	snprintf(worker->short_name, sizeof(worker->short_name), "OpenCL %u", devid);
 	snprintf(worker->short_name, sizeof(worker->short_name), "OpenCL %u", devid);
 
 
 	worker->pipeline_length = starpu_get_env_number_default("STARPU_OPENCL_PIPELINE", 2);
 	worker->pipeline_length = starpu_get_env_number_default("STARPU_OPENCL_PIPELINE", 2);
+	if (worker->pipeline_length > STARPU_MAX_PIPELINE)
+	{
+		_STARPU_DISP("Warning: STARPU_OPENCL_PIPELINE is %u, but STARPU_MAX_PIPELINE is only %u", worker->pipeline_length, STARPU_MAX_PIPELINE);
+		worker->pipeline_length = STARPU_MAX_PIPELINE;
+	}
 
 
 	_STARPU_DEBUG("OpenCL (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, worker->bindid);
 	_STARPU_DEBUG("OpenCL (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, worker->bindid);
 
 
@@ -741,7 +745,10 @@ void *_starpu_opencl_worker(void *_arg)
 	_starpu_opencl_driver_init(worker);
 	_starpu_opencl_driver_init(worker);
 	_STARPU_TRACE_START_PROGRESS(memnode);
 	_STARPU_TRACE_START_PROGRESS(memnode);
 	while (_starpu_machine_is_running())
 	while (_starpu_machine_is_running())
+	{
+		_starpu_may_pause();
 		_starpu_opencl_driver_run_once(worker);
 		_starpu_opencl_driver_run_once(worker);
+	}
 	_starpu_opencl_driver_deinit(worker);
 	_starpu_opencl_driver_deinit(worker);
 	_STARPU_TRACE_END_PROGRESS(memnode);
 	_STARPU_TRACE_END_PROGRESS(memnode);
 
 

+ 29 - 9
src/sched_policies/eager_central_policy.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  INRIA
  * Copyright (C) 2011  INRIA
  *
  *
@@ -91,6 +91,9 @@ static int push_task_eager_policy(struct starpu_task *task)
 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
 	
 	
 	struct starpu_sched_ctx_iterator it;
 	struct starpu_sched_ctx_iterator it;
+#ifndef STARPU_NON_BLOCKING_DRIVERS
+	char dowake[STARPU_NMAXWORKERS] = { 0 };
+#endif
 	if(workers->init_iterator)
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 		workers->init_iterator(workers, &it);
 	
 	
@@ -112,20 +115,35 @@ static int push_task_eager_policy(struct starpu_task *task)
 #ifdef STARPU_NON_BLOCKING_DRIVERS
 #ifdef STARPU_NON_BLOCKING_DRIVERS
 				starpu_bitmap_unset(data->waiters, worker);
 				starpu_bitmap_unset(data->waiters, worker);
 				/* We really woke at least somebody, no need to wake somebody else */
 				/* We really woke at least somebody, no need to wake somebody else */
-				goto out;
+				break;
 #else
 #else
-				starpu_pthread_mutex_t *sched_mutex;
-				starpu_pthread_cond_t *sched_cond;
-				starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
-
-				if (starpu_wakeup_worker(worker, sched_cond, sched_mutex))
-				    goto out; // wake up a single worker
+				dowake[worker] = 1;
 #endif
 #endif
 			}
 			}
 	}
 	}
-out:
+	/* Let the task free */
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 
 
+#ifndef STARPU_NON_BLOCKING_DRIVERS
+	/* Now that we have a list of potential workers, try to wake one */
+	if(workers->init_iterator)
+		workers->init_iterator(workers, &it);
+	
+	while(workers->has_next(workers, &it))
+	{
+		worker = workers->get_next(workers, &it);
+		if (dowake[worker])
+		{
+			starpu_pthread_mutex_t *sched_mutex;
+			starpu_pthread_cond_t *sched_cond;
+			starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
+
+			if (starpu_wakeup_worker(worker, sched_cond, sched_mutex))
+				break; // wake up a single worker
+		}
+	}
+#endif
+
 	return 0;
 	return 0;
 }
 }
 
 
@@ -154,9 +172,11 @@ static struct starpu_task *pop_task_eager_policy(unsigned sched_ctx_id)
 	if (_starpu_fifo_empty(data->fifo))
 	if (_starpu_fifo_empty(data->fifo))
 		return NULL;
 		return NULL;
 
 
+#ifdef STARPU_NON_BLOCKING_DRIVERS
 	if (starpu_bitmap_get(data->waiters, workerid))
 	if (starpu_bitmap_get(data->waiters, workerid))
 		/* Nobody woke us, avoid bothering the mutex */
 		/* Nobody woke us, avoid bothering the mutex */
 		return NULL;
 		return NULL;
+#endif
 
 
 	STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
 
 

+ 29 - 9
src/sched_policies/eager_central_priority_policy.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  INRIA
  * Copyright (C) 2011  INRIA
  *
  *
@@ -139,6 +139,9 @@ static int _starpu_priority_push_task(struct starpu_task *task)
 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
 	
 	
 	struct starpu_sched_ctx_iterator it;
 	struct starpu_sched_ctx_iterator it;
+#ifndef STARPU_NON_BLOCKING_DRIVERS
+	char dowake[STARPU_NMAXWORKERS] = { 0 };
+#endif
 	if(workers->init_iterator)
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 		workers->init_iterator(workers, &it);
 	
 	
@@ -160,20 +163,35 @@ static int _starpu_priority_push_task(struct starpu_task *task)
 #ifdef STARPU_NON_BLOCKING_DRIVERS
 #ifdef STARPU_NON_BLOCKING_DRIVERS
 				starpu_bitmap_unset(data->waiters, worker);
 				starpu_bitmap_unset(data->waiters, worker);
 				/* We really woke at least somebody, no need to wake somebody else */
 				/* We really woke at least somebody, no need to wake somebody else */
-				goto out;
+				break;
 #else
 #else
-				starpu_pthread_mutex_t *sched_mutex;
-				starpu_pthread_cond_t *sched_cond;
-				starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
-
-				if (starpu_wakeup_worker(worker, sched_cond, sched_mutex))
-				    goto out; // wake up a single worker
+				dowake[worker] = 1;
 #endif
 #endif
 			}
 			}
 	}
 	}
-out:
+	/* Let the task free */
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 
 
+#ifndef STARPU_NON_BLOCKING_DRIVERS
+	/* Now that we have a list of potential workers, try to wake one */
+	if(workers->init_iterator)
+		workers->init_iterator(workers, &it);
+	
+	while(workers->has_next(workers, &it))
+	{
+		worker = workers->get_next(workers, &it);
+		if (dowake[worker])
+		{
+			starpu_pthread_mutex_t *sched_mutex;
+			starpu_pthread_cond_t *sched_cond;
+			starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
+
+			if (starpu_wakeup_worker(worker, sched_cond, sched_mutex))
+				break; // wake up a single worker
+		}
+	}
+#endif
+
 	return 0;
 	return 0;
 }
 }
 
 
@@ -194,9 +212,11 @@ static struct starpu_task *_starpu_priority_pop_task(unsigned sched_ctx_id)
 	if (taskq->total_ntasks == 0)
 	if (taskq->total_ntasks == 0)
 		return NULL;
 		return NULL;
 
 
+#ifdef STARPU_NON_BLOCKING_DRIVERS
 	if (starpu_bitmap_get(data->waiters, workerid))
 	if (starpu_bitmap_get(data->waiters, workerid))
 		/* Nobody woke us, avoid bothering the mutex */
 		/* Nobody woke us, avoid bothering the mutex */
 		return NULL;
 		return NULL;
+#endif
 
 
 	/* release this mutex before trying to wake up other workers */
 	/* release this mutex before trying to wake up other workers */
 	starpu_pthread_mutex_t *curr_sched_mutex;
 	starpu_pthread_mutex_t *curr_sched_mutex;

+ 3 - 4
src/sched_policies/locality_work_stealing_policy.c

@@ -224,13 +224,12 @@ static int lws_push_task(struct starpu_task *task)
 #ifndef STARPU_NON_BLOCKING_DRIVERS
 #ifndef STARPU_NON_BLOCKING_DRIVERS
 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
 	struct starpu_sched_ctx_iterator it;
 	struct starpu_sched_ctx_iterator it;
+	unsigned worker;
 	if(workers->init_iterator)
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 		workers->init_iterator(workers, &it);
 	while(workers->has_next(workers, &it))
 	while(workers->has_next(workers, &it))
 	{
 	{
 		worker = workers->get_next(workers, &it);
 		worker = workers->get_next(workers, &it);
-		starpu_pthread_mutex_t *sched_mutex;
-		starpu_pthread_cond_t *sched_cond;
 		starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
 		starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
 		STARPU_PTHREAD_COND_SIGNAL(sched_cond);
 		STARPU_PTHREAD_COND_SIGNAL(sched_cond);
 	}
 	}
@@ -368,6 +367,6 @@ struct starpu_sched_policy _starpu_sched_lws_policy =
 	.pre_exec_hook = NULL,
 	.pre_exec_hook = NULL,
 	.post_exec_hook = NULL,
 	.post_exec_hook = NULL,
 	.pop_every_task = NULL,
 	.pop_every_task = NULL,
-	.policy_name = "nws",
-	.policy_description = "new work stealing"
+	.policy_name = "lws",
+	.policy_description = "locality work stealing"
 };
 };

+ 1 - 0
tests/Makefile.am

@@ -151,6 +151,7 @@ noinst_PROGRAMS =				\
 	datawizard/acquire_cb_insert		\
 	datawizard/acquire_cb_insert		\
 	datawizard/acquire_release		\
 	datawizard/acquire_release		\
 	datawizard/acquire_release2		\
 	datawizard/acquire_release2		\
+	datawizard/cache			\
 	datawizard/commute			\
 	datawizard/commute			\
 	datawizard/copy				\
 	datawizard/copy				\
 	datawizard/data_implicit_deps		\
 	datawizard/data_implicit_deps		\

+ 100 - 0
tests/datawizard/cache.c

@@ -0,0 +1,100 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2014  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "../helper.h"
+
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
+static void codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+     FPRINTF(stderr, "%lx\n", (unsigned long) STARPU_VARIABLE_GET_PTR(descr[0]));
+     FPRINTF(stderr, "codelet\n");
+}
+#endif
+
+#ifdef STARPU_USE_CUDA
+static struct starpu_codelet cuda_cl =
+{
+     .cuda_funcs = {codelet, NULL},
+     .nbuffers = 1,
+     .modes = {STARPU_R}
+};
+#endif
+
+#ifdef STARPU_USE_OPENCL
+static struct starpu_codelet opencl_cl =
+{
+     .opencl_funcs = {codelet, NULL},
+     .nbuffers = 1,
+     .modes = {STARPU_R}
+};
+#endif
+
+void dotest(struct starpu_codelet *cl)
+{
+     int ret;
+     int var = 42;
+     starpu_data_handle_t handle;
+
+     starpu_variable_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)&var, sizeof(var));
+
+     ret = starpu_task_insert(cl, STARPU_R, handle, 0);
+     if (ret == -ENODEV) goto enodev;
+     STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+
+     starpu_task_wait_for_all();
+
+     starpu_data_unregister(handle);
+
+     starpu_variable_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)&var, sizeof(var));
+
+     ret = starpu_task_insert(cl, STARPU_R, handle, 0);
+     if (ret == -ENODEV) goto enodev;
+     STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+
+     starpu_task_wait_for_all();
+
+enodev:
+     starpu_data_unregister(handle);
+}
+
+int main(int argc, char **argv)
+{
+     int ret;
+
+     ret = starpu_init(NULL);
+     if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+     STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_USE_CUDA
+     dotest(&cuda_cl);
+#endif
+#ifdef STARPU_USE_OPENCL
+     dotest(&opencl_cl);
+#endif
+
+     starpu_shutdown();
+
+     return 0;
+
+enodev:
+     starpu_shutdown();
+     /* yes, we do not perform the computation but we did detect that no one
+      * could perform the kernel, so this is not an error from StarPU */
+     fprintf(stderr, "WARNING: No one can execute this task\n");
+     return STARPU_TEST_SKIPPED;
+}

+ 9 - 10
tests/datawizard/gpu_ptr_register.c

@@ -55,11 +55,13 @@ submit_tasks(starpu_data_handle_t handle, int pieces, int n)
 static int
 static int
 find_a_worker(enum starpu_worker_archtype type)
 find_a_worker(enum starpu_worker_archtype type)
 {
 {
-	int worker;
-	int ret = starpu_worker_get_ids_by_type(type, &worker, 1);
+	int worker[STARPU_NMAXWORKERS];
+	int ret = starpu_worker_get_ids_by_type(type, worker, STARPU_NMAXWORKERS);
 	if (ret == 0)
 	if (ret == 0)
 		return -ENODEV;
 		return -ENODEV;
-	return worker;
+	if (ret == -ERANGE)
+		return worker[STARPU_NMAXWORKERS-1];
+	return worker[ret-1];
 }
 }
 
 
 static int
 static int
@@ -78,7 +80,7 @@ check_result(unsigned *t, size_t size)
 }
 }
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-#if CUDART_VERSION >= 4000
+#ifdef HAVE_CUDA_MEMCPY_PEER
 static int
 static int
 test_cuda(void)
 test_cuda(void)
 {
 {
@@ -100,8 +102,7 @@ test_cuda(void)
 	size = 10 * n;
 	size = 10 * n;
 
 
 	devid = starpu_worker_get_devid(chosen);
 	devid = starpu_worker_get_devid(chosen);
-	starpu_cuda_set_device(devid);
-	cudaMalloc((void**)&foo_gpu, size * sizeof(*foo_gpu));
+	foo_gpu = (void*) starpu_malloc_on_node(starpu_worker_get_memory_node(chosen), size * sizeof(*foo_gpu));
 
 
 	foo = calloc(size, sizeof(*foo));
 	foo = calloc(size, sizeof(*foo));
 	for (i = 0; i < size; i++)
 	for (i = 0; i < size; i++)
@@ -180,9 +181,7 @@ test_opencl(void)
 	starpu_opencl_get_context(devid, &context);
 	starpu_opencl_get_context(devid, &context);
 	starpu_opencl_get_queue(devid, &queue);
 	starpu_opencl_get_queue(devid, &queue);
 
 
-	foo_gpu = clCreateBuffer(context, CL_MEM_READ_WRITE, size*sizeof(int), NULL, &err);
-	if (STARPU_UNLIKELY(err != CL_SUCCESS))
-		STARPU_OPENCL_REPORT_ERROR(err);
+	foo_gpu = (void*) starpu_malloc_on_node(starpu_worker_get_memory_node(chosen), size * sizeof(int));
 
 
 	unsigned int *foo = malloc(size*sizeof(*foo));
 	unsigned int *foo = malloc(size*sizeof(*foo));
 	for (i = 0; i < size; i++)
 	for (i = 0; i < size; i++)
@@ -261,7 +260,7 @@ int main(int argc, char **argv)
 #endif
 #endif
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-#if CUDART_VERSION >= 4000 /* We need thread-safety of CUDA */
+#ifdef HAVE_CUDA_MEMCPY_PEER
 	ret = test_cuda();
 	ret = test_cuda();
 	if (ret == 1)
 	if (ret == 1)
 		goto fail;
 		goto fail;

+ 10 - 11
tests/datawizard/gpu_register.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2011-2012  Université de Bordeaux 1
+ * Copyright (C) 2011-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2012 inria
  * Copyright (C) 2012 inria
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -55,11 +55,13 @@ submit_tasks(starpu_data_handle_t handle, int pieces, int n)
 static int
 static int
 find_a_worker(enum starpu_worker_archtype type)
 find_a_worker(enum starpu_worker_archtype type)
 {
 {
-	int worker;
-	int ret = starpu_worker_get_ids_by_type(type, &worker, 1);
+	int worker[STARPU_NMAXWORKERS];
+	int ret = starpu_worker_get_ids_by_type(type, worker, STARPU_NMAXWORKERS);
 	if (ret == 0)
 	if (ret == 0)
 		return -ENODEV;
 		return -ENODEV;
-	return worker;
+	if (ret == -ERANGE)
+		return worker[STARPU_NMAXWORKERS-1];
+	return worker[ret-1];
 }
 }
 
 
 static int
 static int
@@ -78,7 +80,7 @@ check_result(unsigned *t, size_t size)
 }
 }
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-#if CUDART_VERSION >= 4000
+#ifdef HAVE_CUDA_MEMCPY_PEER
 static int
 static int
 test_cuda(void)
 test_cuda(void)
 {
 {
@@ -100,8 +102,7 @@ test_cuda(void)
 	size = 10 * n;
 	size = 10 * n;
 
 
 	devid = starpu_worker_get_devid(chosen);
 	devid = starpu_worker_get_devid(chosen);
-	starpu_cuda_set_device(devid);
-	cudaMalloc((void**)&foo_gpu, size * sizeof(*foo_gpu));
+	foo_gpu = (void*) starpu_malloc_on_node(starpu_worker_get_memory_node(chosen), size * sizeof(*foo_gpu));
 
 
 	foo = calloc(size, sizeof(*foo));
 	foo = calloc(size, sizeof(*foo));
 	for (i = 0; i < size; i++)
 	for (i = 0; i < size; i++)
@@ -182,9 +183,7 @@ test_opencl(void)
 	starpu_opencl_get_context(devid, &context);
 	starpu_opencl_get_context(devid, &context);
 	starpu_opencl_get_queue(devid, &queue);
 	starpu_opencl_get_queue(devid, &queue);
 
 
-	foo_gpu = clCreateBuffer(context, CL_MEM_READ_WRITE, size*sizeof(int), NULL, &err);
-	if (STARPU_UNLIKELY(err != CL_SUCCESS))
-		STARPU_OPENCL_REPORT_ERROR(err);
+	foo_gpu = (void*) starpu_malloc_on_node(starpu_worker_get_memory_node(chosen), size * sizeof(int));
 
 
 	unsigned int *foo = malloc(size*sizeof(*foo));
 	unsigned int *foo = malloc(size*sizeof(*foo));
 	for (i = 0; i < size; i++)
 	for (i = 0; i < size; i++)
@@ -269,7 +268,7 @@ int main(int argc, char **argv)
 #endif
 #endif
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-#if CUDART_VERSION >= 4000 /* We need thread-safety of CUDA */
+#ifdef HAVE_CUDA_MEMCPY_PEER
 	ret = test_cuda();
 	ret = test_cuda();
 	if (ret == 1)
 	if (ret == 1)
 		goto fail;
 		goto fail;

+ 11 - 0
tests/sched_policies/simple_cpu_gpu_sched.c

@@ -205,7 +205,18 @@ run(struct starpu_sched_policy *policy)
 	if (cpu_task_worker != STARPU_CPU_WORKER ||
 	if (cpu_task_worker != STARPU_CPU_WORKER ||
 			(gpu_task_worker != STARPU_CUDA_WORKER &&
 			(gpu_task_worker != STARPU_CUDA_WORKER &&
 			 gpu_task_worker != STARPU_OPENCL_WORKER))
 			 gpu_task_worker != STARPU_OPENCL_WORKER))
+	{
+		if (cpu_task_worker != STARPU_CPU_WORKER)
+		{
+			FPRINTF(stderr, "The CPU task did not run on a CPU worker\n");
+		}
+		if (gpu_task_worker != STARPU_CUDA_WORKER && gpu_task_worker != STARPU_OPENCL_WORKER)
+		{
+			FPRINTF(stderr, "The GPU task did not run on a Cuda or OpenCL worker\n");
+		}
+
 		ret = 1;
 		ret = 1;
+	}
 	else
 	else
 		ret = 0;
 		ret = 0;