浏览代码

merge from trunk

Corentin Salingue 8 年之前
父节点
当前提交
82d0b3921a
共有 50 个文件被更改,包括 679 次插入250 次删除
  1. 3 1
      ChangeLog
  2. 18 3
      doc/doxygen/chapters/210_check_list_performance.doxy
  3. 41 1
      doc/doxygen/chapters/501_environment_variables.doxy
  4. 124 10
      mpi/src/starpu_mpi.c
  5. 2 2
      mpi/src/starpu_mpi_cache.c
  6. 2 2
      mpi/src/starpu_mpi_cache_stats.c
  7. 30 5
      mpi/src/starpu_mpi_comm.c
  8. 24 0
      mpi/src/starpu_mpi_private.h
  9. 5 5
      mpi/src/starpu_mpi_stats.c
  10. 3 2
      mpi/src/starpu_mpi_stats.h
  11. 12 18
      mpi/src/starpu_mpi_task_insert.c
  12. 6 6
      mpi/src/starpu_mpi_task_insert_fortran.c
  13. 3 3
      src/common/fxt.c
  14. 2 2
      src/common/thread.c
  15. 3 3
      src/common/utils.c
  16. 10 3
      src/core/perfmodel/perfmodel_history.c
  17. 7 7
      src/core/sched_policy.c
  18. 32 0
      src/core/simgrid.c
  19. 2 0
      src/core/simgrid.h
  20. 4 4
      src/core/task.c
  21. 15 19
      src/core/topology.c
  22. 2 2
      src/core/workers.c
  23. 2 1
      src/core/workers.h
  24. 1 3
      src/datawizard/coherency.h
  25. 17 17
      src/datawizard/datastats.c
  26. 3 3
      src/datawizard/datastats.h
  27. 23 20
      src/datawizard/memalloc.c
  28. 15 15
      src/datawizard/memstats.c
  29. 2 2
      src/datawizard/memstats.h
  30. 2 1
      src/debug/starpu_debug_helpers.h
  31. 8 7
      src/debug/structures_size.c
  32. 8 8
      src/debug/traces/starpu_fxt.c
  33. 1 1
      src/debug/traces/starpu_fxt_dag.c
  34. 2 2
      src/debug/traces/starpu_fxt_mpi.c
  35. 6 6
      src/drivers/cuda/driver_cuda.c
  36. 4 4
      src/drivers/scc/driver_scc_common.c
  37. 1 1
      src/drivers/scc/driver_scc_sink.c
  38. 2 2
      src/drivers/scc/driver_scc_source.c
  39. 6 6
      src/profiling/bound.c
  40. 34 26
      src/profiling/profiling_helpers.c
  41. 4 5
      src/sched_policies/deque_modeling_policy_data_aware.c
  42. 1 2
      src/sched_policies/helper_mct.c
  43. 2 2
      src/sched_policies/parallel_eager.c
  44. 1 1
      src/sched_policies/scheduler_maker.c
  45. 6 6
      src/util/openmp_runtime_support_environment.c
  46. 5 5
      src/util/starpu_task_insert.c
  47. 5 1
      tests/Makefile.am
  48. 2 1
      tests/microbenchs/display_structures_size.c
  49. 162 0
      tests/sched_ctx/sched_ctx_hierarchy.c
  50. 4 4
      tools/cppcheck/suppressions.txt

+ 3 - 1
ChangeLog

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009-2016  Université de Bordeaux
+# Copyright (C) 2009-2017  Université de Bordeaux
 # Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
 # Copyright (C) 2014, 2016 INRIA
 #
@@ -44,6 +44,8 @@ New features:
   * Add starpu_data_set_user_data and starpu_data_get_user_data.
   * Add STARPU_MPI_FAKE_SIZE and STARPU_MPI_FAKE_RANK to allow simulating
     execution of just one MPI node.
+  * Add STARPU_PERF_MODEL_HOMOGENEOUS_CUDA/OPENCL/MIC/SCC to share performance
+    models between devices, making calibration much faster.
 
 StarPU 1.2.0 (svn revision 18521)
 ==============================================

+ 18 - 3
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -246,9 +246,24 @@ to configure a performance model for the codelets of the application (see
 use on-line calibration.  StarPU will automatically calibrate codelets
 which have never been calibrated yet, and save the result in
 <c>$STARPU_HOME/.starpu/sampling/codelets</c>.
-The models are indexed by machine name. To share the models between
-machines (e.g. for a homogeneous cluster), use <c>export
-STARPU_HOSTNAME=some_global_name</c> (\ref STARPU_HOSTNAME). To force continuing calibration,
+The models are indexed by machine name.
+
+By default, StarPU stores separate performance models according to the hostname
+of the system. To avoid having to calibrate performance models for each node
+of a homogeneous cluster for instance, the model can be shared by using
+<c>export STARPU_HOSTNAME=some_global_name</c> (\ref STARPU_HOSTNAME), where
+<c>some_global_name</c> is the name of the cluster for instance, which thus
+overrides the hostname of the system.
+
+By default, StarPU stores separate performance models for each device. To avoid
+having to calibrate performance models for each device of a homogeneous set of
+devices (e.g. identical GPUs), the model can be shared by setting
+<c>export STARPU_PERF_MODEL_HOMOGENEOUS_CUDA=1</c>,
+<c>export STARPU_PERF_MODEL_HOMOGENEOUS_OPENCL=1</c>,
+<c>export STARPU_PERF_MODEL_HOMOGENEOUS_MIC=1</c>, or
+<c>export STARPU_PERF_MODEL_HOMOGENEOUS_SCC=1</c> (depending on your device type).
+
+To force continuing calibration,
 use <c>export STARPU_CALIBRATE=1</c> (\ref STARPU_CALIBRATE). This may be necessary if your application
 has not-so-stable performance. StarPU will force calibration (and thus ignore
 the current result) until 10 (<c>_STARPU_CALIBRATION_MINIMUM</c>) measurements have been

+ 41 - 1
doc/doxygen/chapters/501_environment_variables.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  * Copyright (C) 2011, 2012, 2016 INRIA
  * Copyright (C) 2016 Uppsala University
  * See the file version.doxy for copying conditions.
@@ -622,6 +622,46 @@ This specifies the main directory in which StarPU stores its
 performance model files. The default is <c>$STARPU_HOME/.starpu/sampling</c>.
 </dd>
 
+<dt>STARPU_PERF_MODEL_HOMOGENEOUS_CUDA</dt>
+<dd>
+\anchor STARPU_PERF_MODEL_HOMOGENEOUS_CUDA
+\addindex __env__STARPU_PERF_MODEL_HOMOGENEOUS_CUDA
+When this is set to 1, StarPU will assume that all CUDA devices have the same
+performance, and thus share performance models for them, thus allowing kernel
+calibration to be much faster, since measurements only have to be done once
+for all CUDA GPUs.
+</dd>
+
+<dt>STARPU_PERF_MODEL_HOMOGENEOUS_OPENCL</dt>
+<dd>
+\anchor STARPU_PERF_MODEL_HOMOGENEOUS_OPENCL
+\addindex __env__STARPU_PERF_MODEL_HOMOGENEOUS_OPENCL
+When this is set to 1, StarPU will assume that all OpenCL devices have the same
+performance, and thus share performance models for them, thus allowing kernel
+calibration to be much faster, since measurements only have to be done once
+for all OpenCL devices.
+</dd>
+
+<dt>STARPU_PERF_MODEL_HOMOGENEOUS_MIC</dt>
+<dd>
+\anchor STARPU_PERF_MODEL_HOMOGENEOUS_MIC
+\addindex __env__STARPU_PERF_MODEL_HOMOGENEOUS_MIC
+When this is set to 1, StarPU will assume that all MIC devices have the same
+performance, and thus share performance models for them, thus allowing kernel
+calibration to be much faster, since measurements only have to be done once
+for all MIC devices.
+</dd>
+
+<dt>STARPU_PERF_MODEL_HOMOGENEOUS_SCC</dt>
+<dd>
+\anchor STARPU_PERF_MODEL_HOMOGENEOUS_SCC
+\addindex __env__STARPU_PERF_MODEL_HOMOGENEOUS_SCC
+When this is set to 1, StarPU will assume that all SCC devices have the same
+performance, and thus share performance models for them, thus allowing kernel
+calibration to be much faster, since measurements only have to be done once
+for all SCC devices.
+</dd>
+
 <dt>STARPU_HOSTNAME</dt>
 <dd>
 \anchor STARPU_HOSTNAME

+ 124 - 10
mpi/src/starpu_mpi.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010-2017  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  * Copyright (C) 2016  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -76,6 +76,10 @@ static int running = 0;
 #ifdef STARPU_SIMGRID
 static int _mpi_world_size;
 static int _mpi_world_rank;
+
+static int wait_counter;
+static starpu_pthread_cond_t wait_counter_cond;
+static starpu_pthread_mutex_t wait_counter_mutex;
 #endif
 int _starpu_mpi_fake_world_size = -1;
 int _starpu_mpi_fake_world_rank = -1;
@@ -143,6 +147,12 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
 	(*req)->early_data_handle = NULL;
 	(*req)->envelope = NULL;
 	(*req)->sequential_consistency = 1;
+
+#ifdef STARPU_SIMGRID
+	starpu_pthread_queue_init(&((*req)->queue));
+	starpu_pthread_queue_register(&wait, &((*req)->queue));
+	(*req)->done = 0;
+#endif
 }
 
 static void _starpu_mpi_request_destroy(struct _starpu_mpi_req *req)
@@ -153,6 +163,10 @@ static void _starpu_mpi_request_destroy(struct _starpu_mpi_req *req)
 	STARPU_PTHREAD_COND_DESTROY(&req->posted_cond);
 	free(req->datatype_name);
 	req->datatype_name = NULL;
+#ifdef STARPU_SIMGRID
+	starpu_pthread_queue_unregister(&wait, &req->queue);
+	starpu_pthread_queue_destroy(&req->queue);
+#endif
 	free(req);
 	req = NULL;
 }
@@ -294,6 +308,9 @@ static void _starpu_mpi_submit_ready_request(void *arg)
 
 	newer_requests = 1;
 	STARPU_PTHREAD_COND_BROADCAST(&cond_progression);
+#ifdef STARPU_SIMGRID
+	starpu_pthread_queue_signal(&dontsleep);
+#endif
 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	_STARPU_MPI_LOG_OUT();
 }
@@ -350,6 +367,55 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_irecv_common(starpu_data_handle
 	return req;
  }
 
+#ifdef STARPU_SIMGRID
+int _starpu_mpi_simgrid_mpi_test(int *done, int *flag)
+{
+	*flag = 0;
+	if (*done)
+	{
+		starpu_pthread_queue_signal(&dontsleep);
+		*flag = 1;
+	}
+	return MPI_SUCCESS;
+}
+static void* _starpu_mpi_simgrid_wait_req_func(void* arg)
+{
+	struct _starpu_simgrid_mpi_req *sim_req = arg;
+	int ret;
+	STARPU_PTHREAD_MUTEX_LOCK(&wait_counter_mutex);
+	wait_counter++;
+	STARPU_PTHREAD_MUTEX_UNLOCK(&wait_counter_mutex);
+
+	ret = MPI_Wait(sim_req->request, sim_req->status);
+
+	STARPU_MPI_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Wait returning %s", _starpu_mpi_get_mpi_error_code(ret));
+
+	*(sim_req->done) = 1;
+	starpu_pthread_queue_signal(sim_req->queue);
+
+	free(sim_req);
+
+	STARPU_PTHREAD_MUTEX_LOCK(&wait_counter_mutex);
+	if (--wait_counter == 0)
+		STARPU_PTHREAD_COND_SIGNAL(&wait_counter_cond);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&wait_counter_mutex);
+
+	return NULL;
+}
+void _starpu_mpi_simgrid_wait_req(MPI_Request *request, MPI_Status *status, starpu_pthread_queue_t *queue, unsigned *done)
+{
+	struct _starpu_simgrid_mpi_req *sim_req;
+	_STARPU_MPI_CALLOC(sim_req, 1, sizeof(struct _starpu_simgrid_mpi_req));
+	sim_req->request = request;
+	sim_req->status = status;
+	sim_req->queue = queue;
+	sim_req->done = done;
+	*done = 0;
+
+	_starpu_simgrid_xbt_thread_create("wait for mpi transfer", _starpu_mpi_simgrid_wait_req_func, sim_req);
+}
+#endif
+
  /********************************************************/
  /*                                                      */
  /*  Send functionalities                                */
@@ -379,6 +445,10 @@ static void _starpu_mpi_isend_data_func(struct _starpu_mpi_req *req)
 		STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Issend returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
 	}
 
+#ifdef STARPU_SIMGRID
+	_starpu_mpi_simgrid_wait_req(&req->data_request, &req->status_store, &req->queue, &req->done);
+#endif
+
 	_STARPU_MPI_TRACE_ISEND_SUBMIT_END(req->node_tag.rank, req->node_tag.data_tag, 0);
 
 	/* somebody is perhaps waiting for the MPI request to be posted */
@@ -571,6 +641,9 @@ static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 	{
 		_STARPU_MPI_COMM_FROM_DEBUG(req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_DATA, req->node_tag.data_tag, req->node_tag.comm);
 		req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_DATA, req->node_tag.comm, &req->data_request);
+#ifdef STARPU_SIMGRID
+		_starpu_mpi_simgrid_wait_req(&req->data_request, &req->status_store, &req->queue, &req->done);
+#endif
 	}
 	STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_IRecv returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
 
@@ -682,6 +755,10 @@ static void _starpu_mpi_wait_func(struct _starpu_mpi_req *waiting_req)
 	_STARPU_MPI_TRACE_UWAIT_BEGIN(req->node_tag.rank, req->node_tag.data_tag);
 	if (req->data_request != MPI_REQUEST_NULL)
 	{
+		// TODO: Fix for STARPU_SIMGRID
+#ifdef STARPU_SIMGRID
+		STARPU_MPI_ASSERT_MSG(0, "Implement this in STARPU_SIMGRID");
+#endif
 		req->ret = MPI_Wait(&req->data_request, waiting_req->status);
 		STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Wait returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
 	}
@@ -755,7 +832,13 @@ static void _starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
 
 	_STARPU_MPI_TRACE_UTESTING_BEGIN(req->node_tag.rank, req->node_tag.data_tag);
 
+#ifdef STARPU_SIMGRID
+	req->ret = _starpu_mpi_simgrid_mpi_test(&req->done, testing_req->flag);
+	memcpy(testing_req->status, &req->status_store, sizeof(*testing_req->status));
+#else
 	req->ret = MPI_Test(&req->data_request, testing_req->flag, testing_req->status);
+#endif
+
 	STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Test returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
 
 	_STARPU_MPI_TRACE_UTESTING_END(req->node_tag.rank, req->node_tag.data_tag);
@@ -1095,7 +1178,6 @@ static void _starpu_mpi_test_detached_requests(void)
 {
 	//_STARPU_MPI_LOG_IN();
 	int flag;
-	MPI_Status status;
 	struct _starpu_mpi_req *req;
 
 	STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
@@ -1106,7 +1188,11 @@ static void _starpu_mpi_test_detached_requests(void)
 		STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
 
 		//_STARPU_MPI_DEBUG(3, "Test detached request %p - mpitag %d - TYPE %s %d\n", &req->data_request, req->node_tag.data_tag, _starpu_mpi_request_type(req->request_type), req->node_tag.rank);
-		req->ret = MPI_Test(&req->data_request, &flag, &status);
+#ifdef STARPU_SIMGRID
+		req->ret = _starpu_mpi_simgrid_mpi_test(&req->done, &flag);
+#else
+		req->ret = MPI_Test(&req->data_request, &flag, MPI_STATUS_IGNORE);
+#endif
 
 		STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Test returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
 
@@ -1354,6 +1440,11 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	STARPU_PTHREAD_COND_SIGNAL(&cond_progression);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
+#ifdef STARPU_SIMGRID
+	starpu_pthread_wait_init(&wait);
+	starpu_pthread_queue_init(&dontsleep);
+	starpu_pthread_queue_register(&wait, &dontsleep);
+#endif
 
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 
@@ -1361,6 +1452,9 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 	while (running || posted_requests || !(_starpu_mpi_req_list_empty(ready_requests)) || !(_starpu_mpi_req_list_empty(detached_requests)))// || !(_starpu_mpi_early_request_count()) || !(_starpu_mpi_sync_data_count()))
 	{
+#ifdef STARPU_SIMGRID
+		starpu_pthread_wait_reset(&wait);
+#endif
 		/* shall we block ? */
 		unsigned block = _starpu_mpi_req_list_empty(ready_requests) && _starpu_mpi_early_request_count() == 0 && _starpu_mpi_sync_data_count() == 0;
 
@@ -1522,7 +1616,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		}
 #ifdef STARPU_SIMGRID
 		STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-		MSG_process_sleep(0.000010);
+		starpu_pthread_wait_wait(&wait);
 		STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 #endif
 	}
@@ -1533,6 +1627,21 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		envelope_request_submitted = 0;
 	}
 
+
+#ifdef STARPU_SIMGRID
+	STARPU_PTHREAD_MUTEX_LOCK(&wait_counter_mutex);
+	while (wait_counter != 0)
+		STARPU_PTHREAD_COND_WAIT(&wait_counter_cond, &wait_counter_mutex);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&wait_counter_mutex);
+
+	STARPU_PTHREAD_MUTEX_DESTROY(&wait_counter_mutex);
+	STARPU_PTHREAD_COND_DESTROY(&wait_counter_cond);
+
+	starpu_pthread_queue_unregister(&wait, &dontsleep);
+	starpu_pthread_queue_destroy(&dontsleep);
+	starpu_pthread_wait_destroy(&wait);
+#endif
+
 	STARPU_MPI_ASSERT_MSG(_starpu_mpi_req_list_empty(detached_requests), "List of detached requests not empty");
 	STARPU_MPI_ASSERT_MSG(_starpu_mpi_req_list_empty(ready_requests), "List of ready requests not empty");
 	STARPU_MPI_ASSERT_MSG(posted_requests == 0, "Number of posted request is not zero");
@@ -1628,6 +1737,11 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi, MPI_Comm
 	STARPU_PTHREAD_MUTEX_INIT(&mutex_posted_requests, NULL);
 	_starpu_mpi_comm = starpu_getenv("STARPU_MPI_COMM") != NULL;
 
+#ifdef STARPU_SIMGRID
+	STARPU_PTHREAD_MUTEX_INIT(&wait_counter_mutex, NULL);
+	STARPU_PTHREAD_COND_INIT(&wait_counter_cond, NULL);
+#endif
+
 #ifdef STARPU_MPI_ACTIVITY
 	hookid = starpu_progression_hook_register(_starpu_mpi_progression_hook_func, NULL);
 	STARPU_MPI_ASSERT_MSG(hookid >= 0, "starpu_progression_hook_register failed");
@@ -1721,6 +1835,9 @@ int starpu_mpi_shutdown(void)
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	running = 0;
 	STARPU_PTHREAD_COND_BROADCAST(&cond_progression);
+#ifdef STARPU_SIMGRID
+	starpu_pthread_queue_signal(&dontsleep);
+#endif
 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
 #ifndef STARPU_SIMGRID
@@ -1740,7 +1857,7 @@ int starpu_mpi_shutdown(void)
 	_starpu_mpi_req_list_delete(detached_requests);
 	_starpu_mpi_req_list_delete(ready_requests);
 
-	_starpu_mpi_comm_amounts_display(rank);
+	_starpu_mpi_comm_amounts_display(stderr, rank);
 	_starpu_mpi_comm_amounts_free();
 	_starpu_mpi_cache_free(world_size);
 	_starpu_mpi_tag_free();
@@ -1857,13 +1974,11 @@ void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle
 	tag = starpu_mpi_data_get_tag(data_handle);
 	if (rank == -1)
 	{
-		fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
-		STARPU_ABORT();
+		_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
 	}
 	if (tag == -1)
 	{
-		fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
-		STARPU_ABORT();
+		_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
 	}
 	starpu_mpi_comm_rank(comm, &me);
 
@@ -1968,4 +2083,3 @@ int starpu_mpi_wait_for_all(MPI_Comm comm)
 	}
 	return 0;
 }
-

+ 2 - 2
mpi/src/starpu_mpi_cache.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016  CNRS
+ * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  * Copyright (C) 2011-2016  Université de Bordeaux
  * Copyright (C) 2014 INRIA
  *
@@ -74,7 +74,7 @@ void _starpu_mpi_cache_init(MPI_Comm comm)
 
 	if (_starpu_cache_enabled == 0)
 	{
-		if (!_starpu_silent) fprintf(stderr,"Warning: StarPU MPI Communication cache is disabled\n");
+		_STARPU_DISP("Warning: StarPU MPI Communication cache is disabled\n");
 		return;
 	}
 

+ 2 - 2
mpi/src/starpu_mpi_cache_stats.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2014, 2015, 2016  CNRS
+ * Copyright (C) 2014, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -33,7 +33,7 @@ void _starpu_mpi_cache_stats_init(MPI_Comm comm)
 	}
 	if (stats_enabled == 0) return;
 
-	if (!_starpu_silent) fprintf(stderr,"Warning: StarPU is executed with STARPU_MPI_CACHE_STATS=1, which slows down a bit\n");
+	_STARPU_DISP("Warning: StarPU is executed with STARPU_MPI_CACHE_STATS=1, which slows down a bit\n");
 
 	starpu_mpi_comm_size(comm, &world_size);
 	_STARPU_MPI_DEBUG(1, "allocating for %d nodes\n", world_size);

+ 30 - 5
mpi/src/starpu_mpi_comm.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016  CNRS
+ * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  * Copyright (C) 2011-2016  Université de Bordeaux
  * Copyright (C) 2014 INRIA
  *
@@ -28,6 +28,12 @@ struct _starpu_mpi_comm
 	struct _starpu_mpi_envelope *envelope;
 	MPI_Request request;
 	int posted;
+
+#ifdef STARPU_SIMGRID
+	MPI_Status status;
+	starpu_pthread_queue_t queue;
+	unsigned done;
+#endif
 };
 struct _starpu_mpi_comm_hashtable
 {
@@ -62,6 +68,10 @@ void _starpu_mpi_comm_free()
 	{
 		struct _starpu_mpi_comm *_comm = _starpu_mpi_comms[i]; // get the ith _comm;
 		free(_comm->envelope);
+#ifdef STARPU_SIMGRID
+		starpu_pthread_queue_unregister(&wait, &_comm->queue);
+		starpu_pthread_queue_destroy(&_comm->queue);
+#endif
 		free(_comm);
 	}
 	free(_starpu_mpi_comms);
@@ -106,6 +116,12 @@ void _starpu_mpi_comm_register(MPI_Comm comm)
 		_STARPU_MPI_MALLOC(entry, sizeof(*entry));
 		entry->comm = comm;
 		HASH_ADD(hh, _starpu_mpi_comms_cache, comm, sizeof(entry->comm), entry);
+
+#ifdef STARPU_SIMGRID
+		starpu_pthread_queue_init(&_comm->queue);
+		starpu_pthread_queue_register(&wait, &_comm->queue);
+		_comm->done = 0;
+#endif
 	}
 	STARPU_PTHREAD_MUTEX_UNLOCK(&_starpu_mpi_comms_mutex);
 }
@@ -123,6 +139,9 @@ void _starpu_mpi_comm_post_recv()
 			_STARPU_MPI_DEBUG(3, "Posting a receive to get a data envelop on comm %d %d\n", i, _comm->comm);
 			_STARPU_MPI_COMM_FROM_DEBUG(sizeof(struct _starpu_mpi_envelope), MPI_BYTE, MPI_ANY_SOURCE, _STARPU_MPI_TAG_ENVELOPE, _STARPU_MPI_TAG_ENVELOPE, _comm->comm);
 			MPI_Irecv(_comm->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, MPI_ANY_SOURCE, _STARPU_MPI_TAG_ENVELOPE, _comm->comm, &_comm->request);
+#ifdef STARPU_SIMGRID
+			_starpu_mpi_simgrid_wait_req(&_comm->request, &_comm->status, &_comm->queue, &_comm->done);
+#endif
 			_comm->posted = 1;
 		}
 	}
@@ -143,9 +162,11 @@ int _starpu_mpi_comm_test_recv(MPI_Status *status, struct _starpu_mpi_envelope *
 			int flag, res;
 			/* test whether an envelope has arrived. */
 #ifdef STARPU_SIMGRID
-			MSG_process_sleep(0.000001);
-#endif
+			res = _starpu_mpi_simgrid_mpi_test(&_comm->done, &flag);
+			memcpy(status, &_comm->status, sizeof(*status));
+#else
 			res = MPI_Test(&_comm->request, &flag, status);
+#endif
 			STARPU_ASSERT(res == MPI_SUCCESS);
 			if (flag)
 			{
@@ -182,9 +203,13 @@ void _starpu_mpi_comm_cancel_recv()
 		struct _starpu_mpi_comm *_comm = _starpu_mpi_comms[i]; // get the ith _comm;
 		if (_comm->posted == 1)
 		{
-			MPI_Status status;
 			MPI_Cancel(&_comm->request);
-			MPI_Wait(&_comm->request, &status);
+#ifndef STARPU_SIMGRID
+			{
+				MPI_Status status;
+				MPI_Wait(&_comm->request, &status);
+			}
+#endif
 			_comm->posted = 0;
 		}
 	}

+ 24 - 0
mpi/src/starpu_mpi_private.h

@@ -24,11 +24,28 @@
 #include "starpu_mpi.h"
 #include "starpu_mpi_fxt.h"
 #include <common/list.h>
+#include <core/simgrid.h>
 
 #ifdef __cplusplus
 extern "C" {
 #endif
+	
+#ifdef STARPU_SIMGRID
+starpu_pthread_wait_t wait;
+starpu_pthread_queue_t dontsleep;
 
+struct _starpu_simgrid_mpi_req
+{
+	MPI_Request *request;
+	MPI_Status *status;
+	starpu_pthread_queue_t *queue;
+	unsigned *done;
+};
+
+int _starpu_mpi_simgrid_mpi_test(int *done, int *flag);
+void _starpu_mpi_simgrid_wait_req(MPI_Request *request, 	MPI_Status *status, starpu_pthread_queue_t *queue, unsigned *done);
+#endif
+	
 extern int _starpu_debug_rank;
 char *_starpu_mpi_get_mpi_error_code(int code);
 extern int _starpu_mpi_comm;
@@ -224,6 +241,13 @@ LIST_TYPE(_starpu_mpi_req,
 	int sequential_consistency;
 
      	UT_hash_handle hh;
+
+#ifdef STARPU_SIMGRID
+        MPI_Status status_store;
+	starpu_pthread_queue_t queue;
+	unsigned done;
+#endif
+	  
 );
 
 struct _starpu_mpi_argc_argv

+ 5 - 5
mpi/src/starpu_mpi_stats.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012, 2013, 2016  CNRS
+ * Copyright (C) 2012, 2013, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -34,7 +34,7 @@ void _starpu_mpi_comm_amounts_init(MPI_Comm comm)
 
 	if (stats_enabled == 0) return;
 
-	if (!_starpu_silent) fprintf(stderr,"Warning: StarPU is executed with STARPU_COMM_STATS=1, which slows down a bit\n");
+	_STARPU_DISP("Warning: StarPU is executed with STARPU_COMM_STATS=1, which slows down a bit\n");
 
 	starpu_mpi_comm_size(comm, &world_size);
 	_STARPU_MPI_DEBUG(1, "allocating for %d nodes\n", world_size);
@@ -68,7 +68,7 @@ void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts)
 	memcpy(comm_amounts, comm_amount, world_size * sizeof(size_t));
 }
 
-void _starpu_mpi_comm_amounts_display(int node)
+void _starpu_mpi_comm_amounts_display(FILE *stream, int node)
 {
 	int dst;
 	size_t sum = 0;
@@ -80,13 +80,13 @@ void _starpu_mpi_comm_amounts_display(int node)
 		sum += comm_amount[dst];
 	}
 
-	fprintf(stderr, "\n[starpu_comm_stats][%d] TOTAL:\t%f B\t%f MB\n", node, (float)sum, (float)sum/1024/1024);
+	fprintf(stream, "\n[starpu_comm_stats][%d] TOTAL:\t%f B\t%f MB\n", node, (float)sum, (float)sum/1024/1024);
 
 	for (dst = 0; dst < world_size; dst++)
 	{
 		if (comm_amount[dst])
 		{
-			fprintf(stderr, "[starpu_comm_stats][%d->%d]\t%f B\t%f MB\n",
+			fprintf(stream, "[starpu_comm_stats][%d->%d]\t%f B\t%f MB\n",
 				node, dst, (float)comm_amount[dst], ((float)comm_amount[dst])/(1024*1024));
 		}
 	}

+ 3 - 2
mpi/src/starpu_mpi_stats.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012  CNRS
+ * Copyright (C) 2012, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -17,6 +17,7 @@
 #ifndef __STARPU_MPI_STATS_H__
 #define __STARPU_MPI_STATS_H__
 
+#include <stdio.h>
 #include <stdlib.h>
 #include <mpi.h>
 
@@ -27,7 +28,7 @@ extern "C" {
 void _starpu_mpi_comm_amounts_init(MPI_Comm comm);
 void _starpu_mpi_comm_amounts_free();
 void _starpu_mpi_comm_amounts_inc(MPI_Comm comm, unsigned dst, MPI_Datatype datatype, int count);
-void _starpu_mpi_comm_amounts_display(int node);
+void _starpu_mpi_comm_amounts_display(FILE *stream, int node);
 
 #ifdef __cplusplus
 }

+ 12 - 18
mpi/src/starpu_mpi_task_insert.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016  CNRS
+ * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  * Copyright (C) 2011-2016  Université de Bordeaux
  * Copyright (C) 2014, 2016 Inria
  *
@@ -84,13 +84,11 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 		int data_tag = starpu_mpi_data_get_tag(data);
 		if (mpi_rank == -1)
 		{
-			fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
-			STARPU_ABORT();
+			_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
 		}
 		if (data_tag == -1)
 		{
-			fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
-			STARPU_ABORT();
+			_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
 		}
 
 		if (do_execute && mpi_rank != me)
@@ -128,13 +126,11 @@ void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum s
 		int data_tag = starpu_mpi_data_get_tag(data);
 		if(mpi_rank == -1)
 		{
-			fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
-			STARPU_ABORT();
+			_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
 		}
 		if(data_tag == -1)
 		{
-			fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
-			STARPU_ABORT();
+			_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
 		}
 		if (mpi_rank == me)
 		{
@@ -520,11 +516,11 @@ int _starpu_mpi_task_insert_v(MPI_Comm comm, struct starpu_codelet *codelet, va_
 
 		if (STARPU_UNLIKELY(ret == -ENODEV))
 		{
-			fprintf(stderr, "submission of task %p wih codelet %p failed (symbol `%s') (err: ENODEV)\n",
-				task, task->cl,
-				(codelet == NULL) ? "none" :
-				task->cl->name ? task->cl->name :
-				(task->cl->model && task->cl->model->symbol)?task->cl->model->symbol:"none");
+			_STARPU_MSG("submission of task %p wih codelet %p failed (symbol `%s') (err: ENODEV)\n",
+				    task, task->cl,
+				    (codelet == NULL) ? "none" :
+				    task->cl->name ? task->cl->name :
+				    (task->cl->model && task->cl->model->symbol)?task->cl->model->symbol:"none");
 
 			task->destroy = 0;
 			starpu_task_destroy(task);
@@ -668,13 +664,11 @@ void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
 	tag = starpu_mpi_data_get_tag(data_handle);
 	if (rank == -1)
 	{
-		fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
-		STARPU_ABORT();
+		_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
 	}
 	if (tag == -1)
 	{
-		fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
-		STARPU_ABORT();
+		_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
 	}
 
 	starpu_mpi_comm_rank(comm, &me);

+ 6 - 6
mpi/src/starpu_mpi_task_insert_fortran.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2016  CNRS
+ * Copyright (C) 2016, 2017  CNRS
  * Copyright (C) 2016 Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -366,11 +366,11 @@ int _fstarpu_mpi_task_insert_v(MPI_Comm comm, struct starpu_codelet *codelet, vo
 
 		if (STARPU_UNLIKELY(ret == -ENODEV))
 		{
-			fprintf(stderr, "submission of task %p wih codelet %p failed (symbol `%s') (err: ENODEV)\n",
-				task, task->cl,
-				(codelet == NULL) ? "none" :
-				task->cl->name ? task->cl->name :
-				(task->cl->model && task->cl->model->symbol)?task->cl->model->symbol:"none");
+			_STARPU_MSG("submission of task %p wih codelet %p failed (symbol `%s') (err: ENODEV)\n",
+				    task, task->cl,
+				    (codelet == NULL) ? "none" :
+				    task->cl->name ? task->cl->name :
+				    (task->cl->model && task->cl->model->symbol)?task->cl->model->symbol:"none");
 
 			task->destroy = 0;
 			starpu_task_destroy(task);

+ 3 - 3
src/common/fxt.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2016  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -199,7 +199,7 @@ void _starpu_stop_fxt_profiling(void)
 #ifdef STARPU_VERBOSE
 	        char hostname[128];
 		gethostname(hostname, 128);
-		fprintf(stderr, "Writing FxT traces into file %s:%s\n", hostname, _STARPU_PROF_FILE_USER);
+		_STARPU_MSG("Writing FxT traces into file %s:%s\n", hostname, _STARPU_PROF_FILE_USER);
 #endif
 		fut_endup(_STARPU_PROF_FILE_USER);
 
@@ -213,7 +213,7 @@ void _starpu_stop_fxt_profiling(void)
 		{
 			/* Something went wrong with the FxT trace (eg. there
 			 * was too many events) */
-			fprintf(stderr, "Warning: the FxT trace could not be generated properly\n");
+			_STARPU_MSG("Warning: the FxT trace could not be generated properly\n");
 		}
 
 		_starpu_written = 1;

+ 2 - 2
src/common/thread.c

@@ -215,7 +215,7 @@ int starpu_pthread_setspecific(starpu_pthread_key_t key, const void *pointer)
 {
 	void **array;
 #ifdef STARPU_SIMGRID_HAVE_SIMIX_PROCESS_GET_CODE
-	if (SIMIX_process_get_code() == _starpu_mpi_simgrid_init)
+	if ((SIMIX_process_get_code() == _starpu_mpi_simgrid_init) || (!strcmp(SIMIX_process_self_get_name(),"wait for mpi transfer")))
 		/* Special-case the SMPI process */
 		array = smpi_process_get_user_data();
 	else
@@ -229,7 +229,7 @@ void* starpu_pthread_getspecific(starpu_pthread_key_t key)
 {
 	void **array;
 #ifdef STARPU_SIMGRID_HAVE_SIMIX_PROCESS_GET_CODE
-	if (SIMIX_process_get_code() == _starpu_mpi_simgrid_init)
+	if ((SIMIX_process_get_code() == _starpu_mpi_simgrid_init) || (!strcmp(SIMIX_process_self_get_name(),"wait for mpi transfer")))
 		/* Special-case the SMPI process */
 		array = smpi_process_get_user_data();
 	else

+ 3 - 3
src/common/utils.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012-2016  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -104,7 +104,7 @@ int _starpu_mkpath(const char *s, mode_t mode)
 	{
 		if (!S_ISDIR(sb.st_mode))
 		{
-			fprintf(stderr,"Error: %s is not a directory:\n", path);
+			_STARPU_MSG("Error: %s is not a directory:\n", path);
 			STARPU_ABORT();
 		}
 		/* It already exists and is a directory.  */
@@ -137,7 +137,7 @@ void _starpu_mkpath_and_check(const char *path, mode_t mode)
 
 	if (ret == -1 && errno != EEXIST)
 	{
-		fprintf(stderr,"Error making StarPU directory %s:\n", path);
+		_STARPU_MSG("Error making StarPU directory %s:\n", path);
 		perror("mkdir");
 		STARPU_ABORT();
 	}

+ 10 - 3
src/core/perfmodel/perfmodel_history.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2016  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2016  Inria
  *
@@ -50,6 +50,7 @@ static int current_arch_comb;
 static int nb_arch_combs;
 static starpu_pthread_rwlock_t arch_combs_mutex;
 static int historymaxerror;
+static char ignore_devid[STARPU_ANY_WORKER];
 
 /* How many executions a codelet will have to be measured before we
  * consider that calibration will provide a value good enough for scheduling */
@@ -108,7 +109,8 @@ int _starpu_perfmodel_arch_comb_get(int ndevices, struct starpu_perfmodel_device
 				for(dev2 = 0; dev2 < ndevices; dev2++)
 				{
 					if(arch_combs[comb]->devices[dev1].type == devices[dev2].type &&
-					   arch_combs[comb]->devices[dev1].devid == devices[dev2].devid &&
+					   (ignore_devid[devices[dev2].type] ||
+					    arch_combs[comb]->devices[dev1].devid == devices[dev2].devid) &&
 					   arch_combs[comb]->devices[dev1].ncores == devices[dev2].ncores)
 						nfounded++;
 				}
@@ -917,6 +919,11 @@ void _starpu_initialize_registered_performance_models(void)
 	STARPU_PTHREAD_RWLOCK_INIT(&arch_combs_mutex, NULL);
 	historymaxerror = starpu_get_env_number_default("STARPU_HISTORY_MAX_ERROR", STARPU_HISTORYMAXERROR);
 	_starpu_calibration_minimum = starpu_get_env_number_default("STARPU_CALIBRATE_MINIMUM", 10);
+	/* ignore_devid[STARPU_CPU_WORKER]; */ /* Always true for now */
+	ignore_devid[STARPU_CUDA_WORKER] = starpu_get_env_number_default("STARPU_PERF_MODEL_HOMOGENEOUS_CUDA", 0);
+	ignore_devid[STARPU_OPENCL_WORKER] = starpu_get_env_number_default("STARPU_PERF_MODEL_HOMOGENEOUS_OPENCL", 0);
+	ignore_devid[STARPU_MIC_WORKER] = starpu_get_env_number_default("STARPU_PERF_MODEL_HOMOGENEOUS_MIC", 0);
+	ignore_devid[STARPU_SCC_WORKER] = starpu_get_env_number_default("STARPU_PERF_MODEL_HOMOGENEOUS_SCC", 0);
 }
 
 void _starpu_deinitialize_performance_model(struct starpu_perfmodel *model)
@@ -1106,7 +1113,7 @@ int starpu_perfmodel_list(FILE *output)
         }
 	return 0;
 #else
-	fprintf(stderr,"Listing perfmodels is not implemented on pure Windows yet\n");
+	_STARPU_MSG("Listing perfmodels is not implemented on pure Windows yet\n");
 	return 1;
 #endif
 }

+ 7 - 7
src/core/sched_policy.c

@@ -144,7 +144,7 @@ static struct starpu_sched_policy *find_sched_policy_from_name(const char *polic
 	return NULL;
 }
 
-static void display_sched_help_message(void)
+static void display_sched_help_message(FILE *stream)
 {
 	const char *sched_env = starpu_getenv("STARPU_SCHED");
 	if (sched_env && (strcmp(sched_env, "help") == 0))
@@ -152,13 +152,13 @@ static void display_sched_help_message(void)
 		/* display the description of all predefined policies */
 		struct starpu_sched_policy **policy;
 
-		fprintf(stderr, "\nThe variable STARPU_SCHED can be set to one of the following strings:\n");
+		fprintf(stream, "\nThe variable STARPU_SCHED can be set to one of the following strings:\n");
 		for(policy=predefined_policies ; *policy!=NULL ; policy++)
 		{
 			struct starpu_sched_policy *p = *policy;
-			fprintf(stderr, "%-30s\t-> %s\n", p->policy_name, p->policy_description);
+			fprintf(stream, "%-30s\t-> %s\n", p->policy_name, p->policy_description);
 		}
-		fprintf(stderr, "\n");
+		fprintf(stream, "\n");
 	 }
 }
 
@@ -197,7 +197,7 @@ struct starpu_sched_policy *_starpu_select_sched_policy(struct _starpu_machine_c
 void _starpu_init_sched_policy(struct _starpu_machine_config *config, struct _starpu_sched_ctx *sched_ctx, struct starpu_sched_policy *selected_policy)
 {
 	/* Perhaps we have to display some help */
-	display_sched_help_message();
+	display_sched_help_message(stderr);
 
 	/* Prefetch is activated by default */
 	use_prefetch = starpu_get_env_number("STARPU_PREFETCH");
@@ -598,7 +598,7 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 
 		if(ret == -1)
 		{
-			fprintf(stderr, "repush task \n");
+			_STARPU_MSG("repush task \n");
 			_STARPU_TRACE_JOB_POP(task, task->priority > 0);
 			ret = _starpu_push_task_to_workers(task);
 		}
@@ -1128,7 +1128,7 @@ void _starpu_print_idle_time()
 	f = fopen(starpu_idle_file, "a");
 	if (!f)
 	{
-		fprintf(stderr, "couldn't open %s: %s\n", starpu_idle_file, strerror(errno));
+		_STARPU_MSG("couldn't open %s: %s\n", starpu_idle_file, strerror(errno));
 	}
 	else
 	{

+ 32 - 0
src/core/simgrid.c

@@ -31,6 +31,7 @@
 
 #ifdef STARPU_SIMGRID
 #include <sys/resource.h>
+#include <simgrid/simix.h>
 
 #pragma weak starpu_main
 extern int starpu_main(int argc, char *argv[]);
@@ -803,4 +804,35 @@ void _starpu_simgrid_count_ngpus(void)
 		}
 #endif
 }
+
+typedef struct{ /* start-up descriptor handed from _starpu_simgrid_xbt_thread_create() to its wrapper */
+  void_f_pvoid_t code; /* function the wrapper calls */
+  void *userparam; /* argument passed to code */
+  void *father_data; /* SIMIX data of the creating process, installed on the new process by the wrapper */
+} thread_data_t;
+
+static int _starpu_simgrid_xbt_thread_create_wrapper(int argc, char *argv[]) /* process entry point: runs t->code(t->userparam); argc/argv are not used; always returns 0 */
+{
+  smx_process_t self = SIMIX_process_self();
+  thread_data_t *t = SIMIX_process_self_get_data(self); /* presumably the descriptor given to simcall_process_create() by _starpu_simgrid_xbt_thread_create() */
+  simcall_process_set_data(self, t->father_data); /* make the creator's data this process's data while the user function runs */
+  t->code(t->userparam);
+  simcall_process_set_data(self, NULL); /* clear the process data again before returning */
+  free(t); /* the descriptor was malloc'ed by the creator and is released here */
+  
+  return 0;
+}
+
+/* Create a SIMIX process called NAME, on the host named by
+ * SIMIX_host_self_get_name(), which runs CODE(PARAM) through
+ * _starpu_simgrid_xbt_thread_create_wrapper().  The descriptor allocated here
+ * records the creator's SIMIX data and is freed by the wrapper once CODE has
+ * returned.  Aborts if the descriptor cannot be allocated. */
+void _starpu_simgrid_xbt_thread_create(const char *name, void_f_pvoid_t code, void *param)
+{
+  thread_data_t *res = malloc(sizeof(thread_data_t));
+  /* Fail loudly instead of dereferencing NULL when out of memory */
+  if (!res)
+    STARPU_ABORT();
+  res->userparam = param;
+  res->code = code;
+  /* Remember the creator's process data so that the wrapper can install it on the new process */
+  res->father_data = SIMIX_process_self_get_data(SIMIX_process_self());
+
+  simcall_process_create(name,
+                           _starpu_simgrid_xbt_thread_create_wrapper, res,
+                           SIMIX_host_self_get_name(), -1.0, 0, NULL,
+                           /*props */ NULL,0);
+}
 #endif

+ 2 - 0
src/core/simgrid.h

@@ -68,6 +68,8 @@ starpu_pthread_queue_t _starpu_simgrid_task_queue[STARPU_NMAXWORKERS];
  * bus */
 void _starpu_simgrid_count_ngpus(void);
 
+void _starpu_simgrid_xbt_thread_create(const char *name, void_f_pvoid_t code,
+				       void *param);
 #endif
 
 #endif // __SIMGRID_H__

+ 4 - 4
src/core/task.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2016  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011, 2014, 2016  INRIA
  * Copyright (C) 2016  Uppsala University
@@ -1241,14 +1241,14 @@ static void *watchdog_func(void *arg)
 		if (!config->watchdog_ok && last_nsubmitted
 				&& last_nsubmitted == starpu_task_nsubmitted())
 		{
-			fprintf(stderr,"The StarPU watchdog detected that no task finished for %fs (can be configure through STARPU_WATCHDOG_TIMEOUT)\n", timeout);
+			_STARPU_MSG("The StarPU watchdog detected that no task finished for %fs (can be configured through STARPU_WATCHDOG_TIMEOUT)\n", timeout);
 			if (watchdog_crash)
 			{
-				fprintf(stderr,"Crashing the process\n");
+				_STARPU_MSG("Crashing the process\n");
 				raise(SIGABRT);
 			}
 			else
-				fprintf(stderr,"Set the STARPU_WATCHDOG_CRASH environment variable if you want to abort the process in such a case\n");
+				_STARPU_MSG("Set the STARPU_WATCHDOG_CRASH environment variable if you want to abort the process in such a case\n");
 		}
 		/* Only shout again after another period */
 		config->watchdog_ok = 1;

+ 15 - 19
src/core/topology.c

@@ -477,10 +477,10 @@ _starpu_init_mic_node (struct _starpu_machine_config *config, int mic_idx,
 
 	if (0 != mic_file_found)
 	{
-		fprintf(stderr, "No MIC program specified, use the environment\n"
-			"variable STARPU_MIC_SINK_PROGRAM_NAME or the environment\n"
-			"or the field 'starpu_conf.mic_sink_program_path'\n"
-			"to define it.\n");
+		_STARPU_MSG("No MIC program specified, use the environment\n"
+			    "variable STARPU_MIC_SINK_PROGRAM_NAME or the environment\n"
+			    "or the field 'starpu_conf.mic_sink_program_path'\n"
+			    "to define it.\n");
 
 		return -1;
 	}
@@ -865,9 +865,7 @@ _starpu_init_mic_config (struct _starpu_machine_config *config,
 		if ((unsigned) nmiccores > topology->nhwmiccores[mic_idx])
 		{
 			/* The user requires more MIC cores than are available */
-			fprintf(stderr,
-				"# Warning: %d MIC cores requested. Only %d available.\n",
-				nmiccores, topology->nhwmiccores[mic_idx]);
+			_STARPU_MSG("# Warning: %d MIC cores requested. Only %d available.\n", nmiccores, topology->nhwmiccores[mic_idx]);
 			nmiccores = topology->nhwmiccores[mic_idx];
 		}
 	}
@@ -992,17 +990,15 @@ _starpu_init_mp_config (struct _starpu_machine_config *config,
              * detected mic devices. ! */
             reqmicdevices = nhwmicdevices;
 
-        if (reqmicdevices != -1)
-        {
-            if ((unsigned) reqmicdevices > nhwmicdevices)
-            {
-                /* The user requires more MIC devices than there is available */
-                fprintf(stderr,
-                    "# Warning: %d MIC devices requested. Only %d available.\n",
-                    reqmicdevices, nhwmicdevices);
-                reqmicdevices = nhwmicdevices;
-            }
-        }
+	if (reqmicdevices != -1)
+	{
+		if ((unsigned) reqmicdevices > nhwmicdevices)
+		{
+			/* The user requires more MIC devices than are available */
+			_STARPU_MSG("# Warning: %d MIC devices requested. Only %d available.\n", reqmicdevices, nhwmicdevices);
+			reqmicdevices = nhwmicdevices;
+		}
+	}
 
         topology->nmicdevices = 0;
         unsigned i;
@@ -1624,7 +1620,7 @@ _starpu_bind_thread_on_cpu (
 	if (ret)
 	{
 		const char *msg = strerror(ret);
-		fprintf(stderr, "pthread_setaffinity_np: %s\n", msg);
+		_STARPU_MSG("pthread_setaffinity_np: %s\n", msg);
 		STARPU_ABORT();
 	}
 

+ 2 - 2
src/core/workers.c

@@ -1649,8 +1649,8 @@ void starpu_shutdown(void)
 	     int stats = starpu_get_env_number("STARPU_STATS");
 	     if (stats != 0)
 	     {
-		  _starpu_display_msi_stats();
-		  _starpu_display_alloc_cache_stats();
+		  _starpu_display_msi_stats(stderr);
+		  _starpu_display_alloc_cache_stats(stderr);
 	     }
 	}
 

+ 2 - 1
src/core/workers.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2016  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
  * Copyright (C) 2011, 2016  INRIA
  * Copyright (C) 2016  Uppsala University
  *
@@ -599,6 +599,7 @@ static inline int _starpu_worker_get_id(void)
 #define starpu_worker_get_id _starpu_worker_get_id
 
 /* Similar behaviour to starpu_worker_get_id() but fails when called from outside a worker */
+/* This returns an unsigned object on purpose, so that the caller is sure to get a non-negative value */
 static inline unsigned __starpu_worker_get_id_check(const char *f, int l)
 {
 	(void) l;

+ 1 - 3
src/datawizard/coherency.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2016  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2017  CNRS
  * Copyright (C) 2014-2016  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -271,8 +271,6 @@ struct _starpu_data_state
 	void *user_data;
 };
 
-void _starpu_display_msi_stats(void);
-
 /* This does not take a reference on the handle, the caller has to do it,
  * e.g. through _starpu_attempt_to_submit_data_request_from_apps()
  * detached means that the core is allowed to drop the request. The caller

+ 17 - 17
src/datawizard/datastats.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010, 2013, 2016  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2015, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -43,7 +43,7 @@ void __starpu_msi_cache_miss(unsigned node)
 	miss_cnt[node]++;
 }
 
-void _starpu_display_msi_stats(void)
+void _starpu_display_msi_stats(FILE *stream)
 {
 	if (!starpu_enable_stats())
 		return;
@@ -52,8 +52,8 @@ void _starpu_display_msi_stats(void)
 	unsigned total_hit_cnt = 0;
 	unsigned total_miss_cnt = 0;
 
-	fprintf(stderr, "\n#---------------------\n");
-	fprintf(stderr, "MSI cache stats :\n");
+	fprintf(stream, "\n#---------------------\n");
+	fprintf(stream, "MSI cache stats :\n");
 
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
@@ -61,18 +61,18 @@ void _starpu_display_msi_stats(void)
 		total_miss_cnt += miss_cnt[node];
 	}
 
-	fprintf(stderr, "TOTAL MSI stats\thit %u (%2.2f %%)\tmiss %u (%2.2f %%)\n", total_hit_cnt, (100.0f*total_hit_cnt)/(total_hit_cnt+total_miss_cnt), total_miss_cnt, (100.0f*total_miss_cnt)/(total_hit_cnt+total_miss_cnt));
+	fprintf(stream, "TOTAL MSI stats\thit %u (%2.2f %%)\tmiss %u (%2.2f %%)\n", total_hit_cnt, (100.0f*total_hit_cnt)/(total_hit_cnt+total_miss_cnt), total_miss_cnt, (100.0f*total_miss_cnt)/(total_hit_cnt+total_miss_cnt));
 
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
 		if (hit_cnt[node]+miss_cnt[node])
 		{
-			fprintf(stderr, "memory node %u\n", node);
-			fprintf(stderr, "\thit : %u (%2.2f %%)\n", hit_cnt[node], (100.0f*hit_cnt[node])/(hit_cnt[node]+miss_cnt[node]));
-			fprintf(stderr, "\tmiss : %u (%2.2f %%)\n", miss_cnt[node], (100.0f*miss_cnt[node])/(hit_cnt[node]+miss_cnt[node]));
+			fprintf(stream, "memory node %u\n", node);
+			fprintf(stream, "\thit : %u (%2.2f %%)\n", hit_cnt[node], (100.0f*hit_cnt[node])/(hit_cnt[node]+miss_cnt[node]));
+			fprintf(stream, "\tmiss : %u (%2.2f %%)\n", miss_cnt[node], (100.0f*miss_cnt[node])/(hit_cnt[node]+miss_cnt[node]));
 		}
 	}
-	fprintf(stderr, "#---------------------\n");
+	fprintf(stream, "#---------------------\n");
 }
 
 /* measure the efficiency of our allocation cache */
@@ -91,25 +91,25 @@ void __starpu_data_allocation_inc_stats(unsigned node)
 	alloc_cnt[node]++;
 }
 
-void _starpu_display_alloc_cache_stats(void)
+void _starpu_display_alloc_cache_stats(FILE *stream)
 {
 	if (!starpu_enable_stats())
 		return;
 
-	fprintf(stderr, "\n#---------------------\n");
-	fprintf(stderr, "Allocation cache stats:\n");
+	fprintf(stream, "\n#---------------------\n");
+	fprintf(stream, "Allocation cache stats:\n");
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
 		if (alloc_cnt[node])
 		{
-			fprintf(stderr, "memory node %u\n", node);
-			fprintf(stderr, "\ttotal alloc : %u\n", alloc_cnt[node]);
-			fprintf(stderr, "\tcached alloc: %u (%2.2f %%)\n",
+			fprintf(stream, "memory node %u\n", node);
+			fprintf(stream, "\ttotal alloc : %u\n", alloc_cnt[node]);
+			fprintf(stream, "\tcached alloc: %u (%2.2f %%)\n",
 				alloc_cache_hit_cnt[node], (100.0f*alloc_cache_hit_cnt[node])/(alloc_cnt[node]));
 		}
 		else
-			fprintf(stderr, "No allocation on node %u\n", node);
+			fprintf(stream, "No allocation on node %u\n", node);
 	}
-	fprintf(stderr, "#---------------------\n");
+	fprintf(stream, "#---------------------\n");
 }

+ 3 - 3
src/datawizard/datastats.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010, 2016  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2015  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2015, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -45,7 +45,7 @@ void __starpu_msi_cache_miss(unsigned node);
 		__starpu_msi_cache_miss(node); \
 } while (0)
 
-void _starpu_display_msi_stats(void);
+void _starpu_display_msi_stats(FILE *stream);
 
 void __starpu_allocation_cache_hit(unsigned node STARPU_ATTRIBUTE_UNUSED);
 void __starpu_data_allocation_inc_stats(unsigned node STARPU_ATTRIBUTE_UNUSED);
@@ -60,6 +60,6 @@ void __starpu_data_allocation_inc_stats(unsigned node STARPU_ATTRIBUTE_UNUSED);
 		__starpu_data_allocation_inc_stats(node); \
 } while (0)
 
-void _starpu_display_alloc_cache_stats(void);
+void _starpu_display_alloc_cache_stats(FILE *stream);
 
 #endif // __DATASTATS_H__

+ 23 - 20
src/datawizard/memalloc.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2017  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  * Copyright (C) 2016  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -492,7 +492,7 @@ static size_t try_to_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
 
 			/* in case there was nobody using that buffer, throw it
 			 * away after writing it back to main memory */
-			
+
 			/* choose the best target */
 			target = choose_target(handle, node);
 
@@ -767,7 +767,6 @@ restart:
 
 		if (mc->data->is_not_important && (mc->footprint == footprint))
 		{
-//			fprintf(stderr, "found a candidate ...\n");
 			/* Note: this may unlock mc_list! */
 			success = try_to_reuse_mem_chunk(mc, node, replicate, 1);
 		}
@@ -1518,7 +1517,7 @@ void _starpu_memchunk_dirty(struct _starpu_mem_chunk *mc, unsigned node)
 }
 
 #ifdef STARPU_MEMORY_STATS
-void _starpu_memory_display_stats_by_node(int node)
+void _starpu_memory_display_stats_by_node(FILE *stream, int node)
 {
 	_starpu_spin_lock(&mc_lock[node]);
 
@@ -1526,38 +1525,42 @@ void _starpu_memory_display_stats_by_node(int node)
 	{
 		struct _starpu_mem_chunk *mc;
 
-		fprintf(stderr, "#-------\n");
-		fprintf(stderr, "Data on Node #%d\n",node);
+		fprintf(stream, "#-------\n");
+		fprintf(stream, "Data on Node #%d\n",node);
 
 		for (mc = _starpu_mem_chunk_list_begin(&mc_list[node]);
 		     mc != _starpu_mem_chunk_list_end(&mc_list[node]);
 		     mc = _starpu_mem_chunk_list_next(mc))
 		{
 			if (mc->automatically_allocated == 0)
-				_starpu_memory_display_handle_stats(mc->data);
+				_starpu_memory_display_handle_stats(stream, mc->data);
 		}
 
 	}
 
 	_starpu_spin_unlock(&mc_lock[node]);
 }
-#endif
 
-void starpu_data_display_memory_stats(void)
+void _starpu_data_display_memory_stats(FILE *stream)
 {
-#ifdef STARPU_MEMORY_STATS
 	unsigned node;
 
-	fprintf(stderr, "\n#---------------------\n");
-	fprintf(stderr, "Memory stats :\n");
+	fprintf(stream, "\n#---------------------\n");
+	fprintf(stream, "Memory stats :\n");
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
-	     _starpu_memory_display_stats_by_node(node);
+		_starpu_memory_display_stats_by_node(stream, node);
 	}
-	fprintf(stderr, "\n#---------------------\n");
-#endif
+	fprintf(stream, "\n#---------------------\n");
 }
+#endif
 
+void starpu_data_display_memory_stats(void)
+{
+#ifdef STARPU_MEMORY_STATS
+	_starpu_data_display_memory_stats(stderr);
+#endif
+}
 
 static int
 get_better_disk_can_accept_size(starpu_data_handle_t handle, unsigned node)
@@ -1566,7 +1569,7 @@ get_better_disk_can_accept_size(starpu_data_handle_t handle, unsigned node)
 	unsigned nnodes = starpu_memory_nodes_get_count();
 	unsigned int i;
 	double time_disk = 0;
-				
+
 	for (i = 0; i < nnodes; i++)
 	{
 		if (starpu_node_get_kind(i) == STARPU_DISK_RAM && i != node &&
@@ -1576,7 +1579,7 @@ get_better_disk_can_accept_size(starpu_data_handle_t handle, unsigned node)
 			/* if we can write on the disk */
 			if (_starpu_get_disk_flag(i) != STARPU_DISK_NO_RECLAIM)
 			{
-				/* only time can change between disk <-> main_ram 
+				/* only time can change between disk <-> main_ram
 				 * and not between main_ram <-> worker if we compare disks */
 				double time_tmp = starpu_transfer_predict(i, STARPU_MAIN_RAM, _starpu_data_get_size(handle));
 				if (target == -1 || time_disk > time_tmp)
@@ -1600,7 +1603,7 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 		/* try to push on RAM if we can before to push on disk */
 		if(starpu_node_get_kind(handle->home_node) == STARPU_DISK_RAM && node != STARPU_MAIN_RAM)
 		{
-			if (handle->per_node[STARPU_MAIN_RAM].allocated || 
+			if (handle->per_node[STARPU_MAIN_RAM].allocated ||
 			    _starpu_memory_manager_test_allocate_size(STARPU_MAIN_RAM, size_handle) == 1)
 			{
 				target = STARPU_MAIN_RAM;
@@ -1612,7 +1615,7 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 
 		}
           	/* other memory nodes */
-		else 
+		else
 		{
 			target = handle->home_node;
 		}
@@ -1626,7 +1629,7 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 		}
 		/* node != 0 */
 		/* try to push data to RAM if we can before to push on disk*/
-		else if (handle->per_node[STARPU_MAIN_RAM].allocated || 
+		else if (handle->per_node[STARPU_MAIN_RAM].allocated ||
 			 _starpu_memory_manager_test_allocate_size(STARPU_MAIN_RAM, size_handle) == 1)
 		{
 			target = STARPU_MAIN_RAM;

+ 15 - 15
src/datawizard/memstats.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010, 2012  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -47,31 +47,31 @@ void _starpu_memory_stats_free(starpu_data_handle_t handle STARPU_ATTRIBUTE_UNUS
 }
 
 #ifdef STARPU_MEMORY_STATS
-void _starpu_memory_display_handle_stats(starpu_data_handle_t handle)
+void _starpu_memory_display_handle_stats(FILE *stream, starpu_data_handle_t handle)
 {
 	unsigned node;
 
-	fprintf(stderr, "#-----\n");
-	fprintf(stderr, "Data : %p\n", handle);
-	fprintf(stderr, "Size : %d\n", (int)handle->ops->get_size(handle));
-	fprintf(stderr, "\n");
+	fprintf(stream, "#-----\n");
+	fprintf(stream, "Data : %p\n", handle);
+	fprintf(stream, "Size : %d\n", (int)handle->ops->get_size(handle));
+	fprintf(stream, "\n");
 
-	fprintf(stderr, "#--\n");
-	fprintf(stderr, "Data access stats\n");
-	fprintf(stderr, "/!\\ Work Underway\n");
+	fprintf(stream, "#--\n");
+	fprintf(stream, "Data access stats\n");
+	fprintf(stream, "/!\\ Work Underway\n");
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
 		if (handle->memory_stats->direct_access[node]+handle->memory_stats->loaded_shared[node]
 		    +handle->memory_stats->invalidated[node]+handle->memory_stats->loaded_owner[node])
 		{
-			fprintf(stderr, "Node #%u\n", node);
-			fprintf(stderr, "\tDirect access : %d\n", handle->memory_stats->direct_access[node]);
+			fprintf(stream, "Node #%u\n", node);
+			fprintf(stream, "\tDirect access : %d\n", handle->memory_stats->direct_access[node]);
 			/* XXX Not Working yet. */
 			if (handle->memory_stats->shared_to_owner[node])
-				fprintf(stderr, "\t\tShared to Owner : %d\n", handle->memory_stats->shared_to_owner[node]);
-			fprintf(stderr, "\tLoaded (Owner) : %d\n", handle->memory_stats->loaded_owner[node]);
-			fprintf(stderr, "\tLoaded (Shared) : %d\n", handle->memory_stats->loaded_shared[node]);
-			fprintf(stderr, "\tInvalidated (was Owner) : %d\n\n", handle->memory_stats->invalidated[node]);
+				fprintf(stream, "\t\tShared to Owner : %d\n", handle->memory_stats->shared_to_owner[node]);
+			fprintf(stream, "\tLoaded (Owner) : %d\n", handle->memory_stats->loaded_owner[node]);
+			fprintf(stream, "\tLoaded (Shared) : %d\n", handle->memory_stats->loaded_shared[node]);
+			fprintf(stream, "\tInvalidated (was Owner) : %d\n\n", handle->memory_stats->invalidated[node]);
 		}
 	}
 }

+ 2 - 2
src/datawizard/memstats.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -42,7 +42,7 @@ void _starpu_memory_stats_init_per_node(starpu_data_handle_t handle, unsigned no
 
 void _starpu_memory_stats_free(starpu_data_handle_t handle);
 
-void _starpu_memory_display_handle_stats(starpu_data_handle_t handle);
+void _starpu_memory_display_handle_stats(FILE *stream, starpu_data_handle_t handle);
 
 void _starpu_memory_handle_stats_cache_hit(starpu_data_handle_t handle, unsigned node);
 void _starpu_memory_handle_stats_loaded_shared(starpu_data_handle_t handle, unsigned node);

+ 2 - 1
src/debug/starpu_debug_helpers.h

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -30,7 +31,7 @@ extern "C"
 void _starpu_benchmark_ping_pong(starpu_data_handle_t handle, unsigned node0, unsigned node1, unsigned niter);
 
 /* Display the size of different data structures */
-void _starpu_debug_display_structures_size(void);
+void _starpu_debug_display_structures_size(FILE *stream);
 
 #ifdef __cplusplus
 }

+ 8 - 7
src/debug/structures_size.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux
+ * Copyright (C) 2017        CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,18 +22,18 @@
 #include <profiling/bound.h>
 #include <debug/starpu_debug_helpers.h>
 
-void _starpu_debug_display_structures_size(void)
+void _starpu_debug_display_structures_size(FILE *stream)
 {
-	fprintf(stderr, "struct starpu_task\t\t%u bytes\t(%x)\n",
+	fprintf(stream, "struct starpu_task\t\t%u bytes\t(%x)\n",
 			(unsigned) sizeof(struct starpu_task), (unsigned) sizeof(struct starpu_task));
-	fprintf(stderr, "struct _starpu_job\t\t%u bytes\t(%x)\n",
+	fprintf(stream, "struct _starpu_job\t\t%u bytes\t(%x)\n",
 			(unsigned) sizeof(struct _starpu_job), (unsigned) sizeof(struct _starpu_job));
-	fprintf(stderr, "struct _starpu_data_state\t%u bytes\t(%x)\n",
+	fprintf(stream, "struct _starpu_data_state\t%u bytes\t(%x)\n",
 			(unsigned) sizeof(struct _starpu_data_state), (unsigned) sizeof(struct _starpu_data_state));
-	fprintf(stderr, "struct _starpu_tag\t\t%u bytes\t(%x)\n",
+	fprintf(stream, "struct _starpu_tag\t\t%u bytes\t(%x)\n",
 			(unsigned) sizeof(struct _starpu_tag), (unsigned) sizeof(struct _starpu_tag));
-	fprintf(stderr, "struct _starpu_cg\t\t%u bytes\t(%x)\n",
+	fprintf(stream, "struct _starpu_cg\t\t%u bytes\t(%x)\n",
 			(unsigned) sizeof(struct _starpu_cg), (unsigned) sizeof(struct _starpu_cg));
-	fprintf(stderr, "struct _starpu_worker\t\t%u bytes\t(%x)\n",
+	fprintf(stream, "struct _starpu_worker\t\t%u bytes\t(%x)\n",
 			(unsigned) sizeof(struct _starpu_worker), (unsigned) sizeof(struct _starpu_worker));
 }

+ 8 - 8
src/debug/traces/starpu_fxt.c

@@ -2213,7 +2213,7 @@ static void handle_mpi_isend_submit_end(struct fxt_ev_64 *ev, struct starpu_fxt_
 	{
 		if (!mpi_warned)
 		{
-			fprintf(stderr,"Warning : Only one trace file is given. MPI transfers will not be displayed. Add all trace files to show them ! \n");
+			_STARPU_MSG("Warning : Only one trace file is given. MPI transfers will not be displayed. Add all trace files to show them ! \n");
 			mpi_warned = 1;
 		}
 	}
@@ -2276,7 +2276,7 @@ static void handle_mpi_irecv_complete_begin(struct fxt_ev_64 *ev, struct starpu_
 	{
 		if (!mpi_warned)
 		{
-			fprintf(stderr,"Warning : Only one trace file is given. MPI transfers will not be displayed. Add all trace files to show them ! \n");
+			_STARPU_MSG("Warning : Only one trace file is given. MPI transfers will not be displayed. Add all trace files to show them ! \n");
 			mpi_warned = 1;
 		}
 	}
@@ -3037,8 +3037,8 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 
 			default:
 #ifdef STARPU_VERBOSE
-				fprintf(stderr, "unknown event.. %x at time %llx WITH OFFSET %llx\n",
-					(unsigned)ev.code, (long long unsigned)ev.time, (long long unsigned)(ev.time-options->file_offset));
+				_STARPU_MSG("unknown event.. %x at time %llx WITH OFFSET %llx\n",
+					    (unsigned)ev.code, (long long unsigned)ev.time, (long long unsigned)(ev.time-options->file_offset));
 #endif
 				break;
 		}
@@ -3195,7 +3195,7 @@ void _starpu_fxt_paje_file_init(struct starpu_fxt_options *options)
 		out_paje_file = fopen(options->out_paje_path, "w+");
 		if (!out_paje_file)
 		{
-			fprintf(stderr,"error while opening %s\n", options->out_paje_path);
+			_STARPU_MSG("error while opening %s\n", options->out_paje_path);
 			perror("fopen");
 			exit(1);
 		}
@@ -3343,7 +3343,7 @@ void starpu_fxt_generate_trace(struct starpu_fxt_options *options)
 				{
 					if (key != unique_keys[inputfile])
 					{
-						fprintf(stderr, "Warning: traces are coming from different run so we will not try to display MPI communications.\n");
+						_STARPU_MSG("Warning: traces are coming from different run so we will not try to display MPI communications.\n");
 						display_mpi = 0;
 					}
 				}
@@ -3523,8 +3523,8 @@ void starpu_fxt_write_data_trace(char *filename_in)
 
 		default:
 #ifdef STARPU_VERBOSE
-			fprintf(stderr, "unknown event.. %x at time %llx WITH OFFSET %llx\n",
-				(unsigned)ev.code, (long long unsigned)ev.time, (long long unsigned)(ev.time));
+			_STARPU_MSG("unknown event.. %x at time %llx WITH OFFSET %llx\n",
+				    (unsigned)ev.code, (long long unsigned)ev.time, (long long unsigned)(ev.time));
 #endif
 			break;
 		}

+ 1 - 1
src/debug/traces/starpu_fxt_dag.c

@@ -37,7 +37,7 @@ void _starpu_fxt_dag_init(char *out_path)
 	out_file = fopen(out_path, "w+");
 	if (!out_file)
 	{
-		fprintf(stderr,"error while opening %s\n", out_path);
+		_STARPU_MSG("error while opening %s\n", out_path);
 		perror("fopen");
 		exit(1);
 	}

+ 2 - 2
src/debug/traces/starpu_fxt_mpi.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2012-2013, 2016  Université Bordeaux
- * Copyright (C) 2010, 2011, 2014, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2014, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -70,7 +70,7 @@ int _starpu_fxt_mpi_find_sync_point(char *filename_in, uint64_t *offset, int *ke
 		int ret = fxt_next_ev(block, FXT_EV_TYPE_64, (struct fxt_ev *)&ev);
 		if (ret != FXT_EV_OK)
 		{
-			fprintf(stderr, "no more block ...\n");
+			_STARPU_MSG("no more block ...\n");
 			break;
 		}
 

+ 6 - 6
src/drivers/cuda/driver_cuda.c

@@ -227,13 +227,13 @@ void starpu_cuda_set_device(unsigned devid STARPU_ATTRIBUTE_UNUSED)
 #ifdef HAVE_CUDA_MEMCPY_PEER
 	if (conf->n_cuda_opengl_interoperability)
 	{
-		fprintf(stderr, "OpenGL interoperability was requested, but StarPU was built with multithread GPU control support, please reconfigure with --disable-cuda-memcpy-peer but that will disable the memcpy-peer optimizations\n");
+		_STARPU_MSG("OpenGL interoperability was requested, but StarPU was built with multithread GPU control support, please reconfigure with --disable-cuda-memcpy-peer but that will disable the memcpy-peer optimizations\n");
 		STARPU_ABORT();
 	}
 #elif !defined(HAVE_CUDA_GL_INTEROP_H)
 	if (conf->n_cuda_opengl_interoperability)
 	{
-		fprintf(stderr,"OpenGL interoperability was requested, but cuda_gl_interop.h could not be compiled, please make sure that OpenGL headers were available before ./configure run.");
+		_STARPU_MSG("OpenGL interoperability was requested, but cuda_gl_interop.h could not be compiled, please make sure that OpenGL headers were available before ./configure run.");
 		STARPU_ABORT();
 	}
 #else
@@ -324,7 +324,7 @@ static void init_device_context(unsigned devid, unsigned memnode)
 	{
 		if (cures == cudaErrorDevicesUnavailable)
 		{
-			fprintf(stderr,"All CUDA-capable devices are busy or unavailable\n");
+			_STARPU_MSG("All CUDA-capable devices are busy or unavailable\n");
 			exit(77);
 		}
 		STARPU_CUDA_REPORT_ERROR(cures);
@@ -336,7 +336,7 @@ static void init_device_context(unsigned devid, unsigned memnode)
 #ifdef HAVE_CUDA_MEMCPY_PEER
 	if (props[devid].computeMode == cudaComputeModeExclusive)
 	{
-		fprintf(stderr, "CUDA is in EXCLUSIVE-THREAD mode, but StarPU was built with multithread GPU control support, please either ask your administrator to use EXCLUSIVE-PROCESS mode (which should really be fine), or reconfigure with --disable-cuda-memcpy-peer but that will disable the memcpy-peer optimizations\n");
+		_STARPU_MSG("CUDA is in EXCLUSIVE-THREAD mode, but StarPU was built with multithread GPU control support, please either ask your administrator to use EXCLUSIVE-PROCESS mode (which should really be fine), or reconfigure with --disable-cuda-memcpy-peer but that will disable the memcpy-peer optimizations\n");
 		STARPU_ABORT();
 	}
 #endif
@@ -447,7 +447,7 @@ unsigned _starpu_get_cuda_device_count(void)
 
 	if (cnt > STARPU_MAXCUDADEVS)
 	{
-		fprintf(stderr, "# Warning: %d CUDA devices available. Only %d enabled. Use configure option --enable-maxcudadev=xxx to update the maximum value of supported CUDA devices.\n", cnt, STARPU_MAXCUDADEVS);
+		_STARPU_MSG("# Warning: %d CUDA devices available. Only %d enabled. Use configure option --enable-maxcudadev=xxx to update the maximum value of supported CUDA devices.\n", cnt, STARPU_MAXCUDADEVS);
 		cnt = STARPU_MAXCUDADEVS;
 	}
 	return (unsigned)cnt;
@@ -973,7 +973,7 @@ void starpu_cublas_report_error(const char *func, const char *file, int line, in
 			errormsg = "unknown error";
 			break;
 	}
-	fprintf(stderr, "oops in %s (%s:%d)... %d: %s \n", func, file, line, status, errormsg);
+	_STARPU_MSG("oops in %s (%s:%d)... %d: %s \n", func, file, line, status, errormsg);
 	STARPU_ABORT();
 }
 

+ 4 - 4
src/drivers/scc/driver_scc_common.c

@@ -47,9 +47,9 @@ static void _starpu_scc_set_src_node_id()
 		else if (RCCE_ue() == 0)
 		{
 			/* Only node 0 print the error message. */
-			fprintf(stderr, "The node you specify to be the master is "
-					"greater than the total number of nodes.\n"
-					"Taking node 0 (core %d) by default...\n", RC_COREID[0]);
+			_STARPU_MSG("The node you specify to be the master is "
+				    "greater than the total number of nodes.\n"
+				    "Taking node 0 (core %d) by default...\n", RC_COREID[0]);
 		}
 	}
 
@@ -173,7 +173,7 @@ void _starpu_scc_common_report_rcce_error(const char *func, const char *file, co
 
 	RCCE_error_string(err_no, error_string, &error_string_length); 
 
-	fprintf(stderr, "RCCE error in %s (%s:%d): %s\n", func, file, line, error_string); 
+	_STARPU_MSG("RCCE error in %s (%s:%d): %s\n", func, file, line, error_string); 
 	STARPU_ABORT();
 }
 

+ 1 - 1
src/drivers/scc/driver_scc_sink.c

@@ -145,7 +145,7 @@ void _starpu_scc_sink_execute(const struct _starpu_mp_node *node, void *arg, int
 			case STARPU_CSR_INTERFACE_ID:
 			case STARPU_BCSR_INTERFACE_ID:
 			case STARPU_MULTIFORMAT_INTERFACE_ID:
-			fprintf(stderr, "Data type not supported on SCC.\n");
+				_STARPU_MSG("Data type not supported on SCC.\n");
 
 			default:
 				STARPU_ABORT();

+ 2 - 2
src/drivers/scc/driver_scc_source.c

@@ -239,8 +239,8 @@ void _starpu_scc_set_offset_in_shared_memory(void *ptr, void **dev_handle, size_
 	{
 		if (!_starpu_scc_common_is_in_shared_memory(ptr))
 		{
-			fprintf(stderr, "The data (%p) you want to register does not seem to be allocated in shared memory. "
-					"Please use starpu_malloc to do this.\n", ptr);
+			_STARPU_MSG("The data (%p) you want to register does not seem to be allocated in shared memory. "
+				    "Please use starpu_malloc to do this.\n", ptr);
 			STARPU_ABORT();
 		}
 

+ 6 - 6
src/profiling/bound.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
  * Copyright (C) 2010-2016  Université de Bordeaux
  * Copyright (C) 2011  Télécom-SudParis
  *
@@ -397,7 +397,7 @@ void _starpu_bound_job_id_dep(starpu_data_handle_t handle, struct _starpu_job *j
 	dep_t = find_job(id);
 	if (!dep_t)
 	{
-		fprintf(stderr,"dependency %lu not found !\n", id);
+		_STARPU_MSG("dependency %lu not found !\n", id);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 		return;
 	}
@@ -520,7 +520,7 @@ void starpu_bound_print_lp(FILE *output)
 			if (t1->cl->model->type != STARPU_HISTORY_BASED &&
 			    t1->cl->model->type != STARPU_NL_REGRESSION_BASED)
 				/* TODO: */
-				fprintf(stderr, "Warning: task %s uses a perf model which is neither history nor non-linear regression-based, support for such model is not implemented yet, system will not be solvable.\n", _starpu_codelet_get_model_name(t1->cl));
+				_STARPU_MSG("Warning: task %s uses a perf model which is neither history nor non-linear regression-based, support for such model is not implemented yet, system will not be solvable.\n", _starpu_codelet_get_model_name(t1->cl));
 
 			struct _starpu_job j =
 			{
@@ -824,7 +824,7 @@ void starpu_bound_print_lp(FILE *output)
 				for (w = 0; w < nw; w++)
 				{
 					if (isnan(times[w*nt+t]))
-						fprintf(stderr, "Warning: task %s has no performance measurement for worker %d.\n", _starpu_codelet_get_model_name(tp->cl), w);
+						_STARPU_MSG("Warning: task %s has no performance measurement for worker %d.\n", _starpu_codelet_get_model_name(tp->cl), w);
 					else
 					{
 						got_one = 1;
@@ -833,7 +833,7 @@ void starpu_bound_print_lp(FILE *output)
 				}
 				fprintf(output, " = %lu;\n", tp->n);
 				if (!got_one)
-					fprintf(stderr, "Warning: task %s has no performance measurement for any worker, system will not be solvable!\n", _starpu_codelet_get_model_name(tp->cl));
+					_STARPU_MSG("Warning: task %s has no performance measurement for any worker, system will not be solvable!\n", _starpu_codelet_get_model_name(tp->cl));
 				/* Show actual values */
 				fprintf(output, "/*");
 				for (w = 0; w < nw; w++)
@@ -1133,7 +1133,7 @@ void starpu_bound_print(FILE *output, int integer STARPU_ATTRIBUTE_UNUSED)
 	}
 	else
 	{
-		fprintf(stderr, "Simplex failed\n");
+		_STARPU_MSG("Simplex failed\n");
 	}
 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 #else /* STARPU_HAVE_GLPK_H */

+ 34 - 26
src/profiling/profiling_helpers.c

@@ -31,17 +31,14 @@ static double convert_to_byte_units(float d, unsigned max_unit, unsigned *unit)
 	return d;
 }
 
-void starpu_profiling_bus_helper_display_summary(void)
+void _starpu_profiling_bus_helper_display_summary(FILE *stream)
 {
-	const char *stats;
 	int long long sum_transferred = 0;
 	const char *byte_units[] = { "B", "KB", "MB", "GB", "TB" };
 	unsigned max_unit = sizeof(byte_units) / sizeof(byte_units[0]);
 
-	if (!((stats = starpu_getenv("STARPU_BUS_STATS")) && atoi(stats))) return;
-
-	fprintf(stderr, "\n#---------------------\n");
-	fprintf(stderr, "Data transfer stats:\n");
+	fprintf(stream, "\n#---------------------\n");
+	fprintf(stream, "Data transfer stats:\n");
 
 	int busid;
 	int bus_cnt = starpu_bus_get_count();
@@ -66,10 +63,10 @@ void starpu_profiling_bus_helper_display_summary(void)
 		_starpu_memory_node_get_name(src, src_name, sizeof(src_name));
 		_starpu_memory_node_get_name(dst, dst_name, sizeof(dst_name));
 
-		fprintf(stderr, "\t%s -> %s", src_name, dst_name);
-		fprintf(stderr, "\t%.2lf %s", d, byte_units[unit]);
-		fprintf(stderr, "\t%.2lf %s/s", d / elapsed_time, byte_units[unit]);
-		fprintf(stderr, "\t(transfers : %lld - avg %.2lf %s)\n", transfer_cnt, d / transfer_cnt, byte_units[unit]);
+		fprintf(stream, "\t%s -> %s", src_name, dst_name);
+		fprintf(stream, "\t%.2lf %s", d, byte_units[unit]);
+		fprintf(stream, "\t%.2lf %s/s", d / elapsed_time, byte_units[unit]);
+		fprintf(stream, "\t(transfers : %lld - avg %.2lf %s)\n", transfer_cnt, d / transfer_cnt, byte_units[unit]);
 
 		sum_transferred += transferred;
 	}
@@ -77,23 +74,27 @@ void starpu_profiling_bus_helper_display_summary(void)
 	unsigned unit = 0;
 	double d = convert_to_byte_units(sum_transferred, max_unit, &unit);
 
-	fprintf(stderr, "Total transfers: %.2lf %s\n", d, byte_units[unit]);
-	fprintf(stderr, "#---------------------\n");
+	fprintf(stream, "Total transfers: %.2lf %s\n", d, byte_units[unit]);
+	fprintf(stream, "#---------------------\n");
 }
 
-void starpu_profiling_worker_helper_display_summary(void)
+void starpu_profiling_bus_helper_display_summary(void)
 {
 	const char *stats;
+	if (!((stats = starpu_getenv("STARPU_BUS_STATS")) && atoi(stats))) return;
+	_starpu_profiling_bus_helper_display_summary(stderr);
+}
+
+void _starpu_profiling_worker_helper_display_summary(FILE *stream)
+{
 	double sum_consumed = 0.;
 	int profiling = starpu_profiling_status_get();
 	double overall_time = 0;
 	int workerid;
 	int worker_cnt = starpu_worker_get_count();
 
-	if (!((stats = starpu_getenv("STARPU_WORKER_STATS")) && atoi(stats))) return;
-
-	fprintf(stderr, "\n#---------------------\n");
-	fprintf(stderr, "Worker stats:\n");
+	fprintf(stream, "\n#---------------------\n");
+	fprintf(stream, "Worker stats:\n");
 
 	for (workerid = 0; workerid < worker_cnt; workerid++)
 	{
@@ -103,8 +104,8 @@ void starpu_profiling_worker_helper_display_summary(void)
 
 		starpu_worker_get_name(workerid, name, sizeof(name));
 
-		fprintf(stderr, "%-32s\n", name);
-		fprintf(stderr, "\t%d task(s)\n", info.executed_tasks);
+		fprintf(stream, "%-32s\n", name);
+		fprintf(stream, "\t%d task(s)\n", info.executed_tasks);
 
 		if (profiling)
 		{
@@ -114,14 +115,14 @@ void starpu_profiling_worker_helper_display_summary(void)
 			if (total_time > overall_time)
 				overall_time = total_time;
 
-			fprintf(stderr, "\ttotal: %.2lf ms executing: %.2lf ms sleeping: %.2lf ms overhead %.2lf ms\n",
+			fprintf(stream, "\ttotal: %.2lf ms executing: %.2lf ms sleeping: %.2lf ms overhead %.2lf ms\n",
 				total_time, executing_time, sleeping_time, total_time - executing_time - sleeping_time);
 			if (info.used_cycles || info.stall_cycles)
-				fprintf(stderr, "\t%llu Mcy %llu Mcy stall\n", (unsigned long long)info.used_cycles/1000000, (unsigned long long)info.stall_cycles/1000000);
+				fprintf(stream, "\t%llu Mcy %llu Mcy stall\n", (unsigned long long)info.used_cycles/1000000, (unsigned long long)info.stall_cycles/1000000);
 			if (info.energy_consumed)
-				fprintf(stderr, "\t%f J consumed\n", info.energy_consumed);
+				fprintf(stream, "\t%f J consumed\n", info.energy_consumed);
 			if (info.flops)
-				fprintf(stderr, "\t%f GFlop/s\n\n", info.flops / total_time / 1000000);
+				fprintf(stream, "\t%f GFlop/s\n\n", info.flops / total_time / 1000000);
 		}
 
 		sum_consumed += info.energy_consumed;
@@ -135,10 +136,17 @@ void starpu_profiling_worker_helper_display_summary(void)
 			double idle_power = atof(strval_idle_power); /* Watt */
 			double idle_energy = idle_power * overall_time / 1000.; /* J */
 
-			fprintf(stderr, "Idle energy: %.2lf J\n", idle_energy);
-			fprintf(stderr, "Total energy: %.2lf J\n",
+			fprintf(stream, "Idle energy: %.2lf J\n", idle_energy);
+			fprintf(stream, "Total energy: %.2lf J\n",
 				sum_consumed + idle_energy);
 		}
 	}
-	fprintf(stderr, "#---------------------\n");
+	fprintf(stream, "#---------------------\n");
+}
+
+void starpu_profiling_worker_helper_display_summary(void)
+{
+	const char *stats;
+	if (!((stats = starpu_getenv("STARPU_WORKER_STATS")) && atoi(stats))) return;
+	_starpu_profiling_worker_helper_display_summary(stderr);
 }

+ 4 - 5
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2017  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016, 2017  CNRS
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011-2012, 2016  INRIA
  * Copyright (C) 2016  Uppsala University
@@ -111,10 +111,9 @@ static void param_modified(struct starpu_top_param* d)
 #warning FIXME: get sched ctx to get alpha/beta/gamma/idle values
 #endif
 	/* Just to show parameter modification. */
-	fprintf(stderr,
-		"%s has been modified : "
-		"alpha=%f|beta=%f|gamma=%f|idle_power=%f !\n",
-		d->name, alpha,beta,_gamma, idle_power);
+	_STARPU_MSG("%s has been modified : "
+		    "alpha=%f|beta=%f|gamma=%f|idle_power=%f !\n",
+		    d->name, alpha,beta,_gamma, idle_power);
 }
 #endif /* !STARPU_USE_TOP */
 

+ 1 - 2
src/sched_policies/helper_mct.c

@@ -32,8 +32,7 @@
 static void param_modified(struct starpu_top_param* d)
 {
 	/* Just to show parameter modification. */
-	fprintf(stderr, "%s has been modified : %f\n",
-			d->name, *(double*) d->value);
+	_STARPU_MSG("%s has been modified : %f\n", d->name, *(double*) d->value);
 }
 #endif /* !STARPU_USE_TOP */
 

+ 2 - 2
src/sched_policies/parallel_eager.c

@@ -3,7 +3,7 @@
  * Copyright (C) 2011-2016  Université de Bordeaux
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011-2013  INRIA
- * Copyright (C) 2016       CNRS
+ * Copyright (C) 2016, 2017       CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -100,7 +100,7 @@ static void peager_add_workers(unsigned sched_ctx_id, int *workerids, unsigned n
         {
 		workerid = workerids[i];
 
-		fprintf(stderr, "MASTER of %d = %d\n", workerid, master_id[workerid]);
+		_STARPU_MSG("MASTER of %d = %d\n", workerid, master_id[workerid]);
 	}
 #endif
 }

+ 1 - 1
src/sched_policies/scheduler_maker.c

@@ -265,7 +265,7 @@ struct starpu_sched_tree * starpu_sched_component_make_scheduler(unsigned sched_
 
 	starpu_sched_tree_update_workers(tree);
 #ifdef STARPU_DEVEL
-	fprintf(stderr, "scheduler created :\n");
+	_STARPU_MSG("scheduler created :\n");
 	helper_display_scheduler(stderr, 0, tree->root);
 #endif
 

+ 6 - 6
src/util/openmp_runtime_support_environment.c

@@ -236,7 +236,7 @@ static void read_wait_policy_var()
 	ret = read_string_var(env, strings, &value);
 	if (!ret)
 	{
-		fprintf(stderr, "StarPU: Invalid value for environment variable OMP_WAIT_POLICY\n");
+		_STARPU_MSG("StarPU: Invalid value for environment variable OMP_WAIT_POLICY\n");
 		return;
 	}
 	_initial_icv_values.wait_policy_var = value;
@@ -256,7 +256,7 @@ static void read_display_env_var(int *dest)
 	ret = read_string_var(env, strings, &value);
 	if (!ret)
 	{
-		fprintf(stderr, "StarPU: Invalid value for environment variable OMP_DISPLAY_ENV\n");
+		_STARPU_MSG("StarPU: Invalid value for environment variable OMP_DISPLAY_ENV\n");
 		return;
 	}
 
@@ -559,7 +559,7 @@ static void read_proc_bind_var()
 
 			if (!read_string_var(token, strings, &value))
 			{
-				fprintf(stderr, "StarPU: Invalid value for environment variable OMP_PROC_BIND\n");
+				_STARPU_MSG("StarPU: Invalid value for environment variable OMP_PROC_BIND\n");
 				break;
 			}
 
@@ -590,7 +590,7 @@ static void read_num_threads_var()
 
 			if (!read_int_var(token, &value))
 			{
-				fprintf(stderr, "StarPU: Invalid value for environment variable OMP_NUM_THREADS\n");
+				_STARPU_MSG("StarPU: Invalid value for environment variable OMP_NUM_THREADS\n");
 				break;
 			}
 
@@ -613,7 +613,7 @@ static void read_omp_int_var(const char *name, int *icv)
 	ret = read_int_var(env, &value);
 	if (!ret || value < 0)
 	{
-		fprintf(stderr, "StarPU: Invalid value for environment variable %s\n", name);
+		_STARPU_MSG("StarPU: Invalid value for environment variable %s\n", name);
 		return;
 	}
 	*icv = value;
@@ -632,7 +632,7 @@ static void read_omp_boolean_var(const char *name, int *icv)
 	ret = read_string_var(env, strings, &value);
 	if (!ret)
 	{
-		fprintf(stderr, "StarPU: Invalid value for environment variable %s\n", name);
+		_STARPU_MSG("StarPU: Invalid value for environment variable %s\n", name);
 		return;
 	}
 	*icv = value;

+ 5 - 5
src/util/starpu_task_insert.c

@@ -141,11 +141,11 @@ int _starpu_task_insert_v(struct starpu_codelet *cl, va_list varg_list)
 
 	if (STARPU_UNLIKELY(ret == -ENODEV))
 	{
-		fprintf(stderr, "submission of task %p wih codelet %p failed (symbol `%s') (err: ENODEV)\n",
-			task, task->cl,
-			(cl == NULL) ? "none" :
-			task->cl->name ? task->cl->name :
-			(task->cl->model && task->cl->model->symbol)?task->cl->model->symbol:"none");
+		_STARPU_MSG("submission of task %p wih codelet %p failed (symbol `%s') (err: ENODEV)\n",
+			    task, task->cl,
+			    (cl == NULL) ? "none" :
+			    task->cl->name ? task->cl->name :
+			    (task->cl->model && task->cl->model->symbol)?task->cl->model->symbol:"none");
 
 		task->destroy = 0;
 		starpu_task_destroy(task);

+ 5 - 1
tests/Makefile.am

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2009-2017  Université de Bordeaux
-# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
+# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
 # Copyright (C) 2010, 2011, 2012  INRIA
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -157,6 +157,7 @@ myPROGRAMS +=				\
 	microbenchs/display_structures_size	\
 	microbenchs/local_pingpong		\
 	sched_ctx/sched_ctx_list		\
+	sched_ctx/sched_ctx_hierarchy		\
 	perfmodels/value_nan
 
 if !STARPU_SIMGRID
@@ -579,6 +580,9 @@ endif
 sched_ctx_sched_ctx_list_SOURCES =	\
 	sched_ctx/sched_ctx_list.c
 
+sched_ctx_sched_ctx_hierarchy_SOURCES =	\
+	sched_ctx/sched_ctx_hierarchy.c
+
 openmp_init_exit_01_SOURCES = 	\
 	openmp/init_exit_01.c
 

+ 2 - 1
tests/microbenchs/display_structures_size.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2016  Université de Bordeaux
+ * Copyright (C) 2017        CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,7 +24,7 @@
 
 int main(int argc, char **argv)
 {
-	_starpu_debug_display_structures_size();
+	_starpu_debug_display_structures_size(stderr);
 
 	return EXIT_SUCCESS;
 }

+ 162 - 0
tests/sched_ctx/sched_ctx_hierarchy.c

@@ -0,0 +1,162 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2017  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <config.h>
+#include <starpu.h>
+#include "../helper.h"
+
+/* Tentative definition: func_cpu_bis() below submits tasks on this codelet,
+ * while the codelet initializer further down needs func_cpu_bis() declared. */
+struct starpu_codelet mycodelet_bis;
+
+/*
+ * CPU kernel of the chained follow-up tasks.
+ *
+ * Unpacks (msg, ntasks, worker_id_expected) from _args, asserts that it is
+ * running on the expected worker and, while ntasks is still positive,
+ * submits one more mycodelet_bis task with the counter decremented by one.
+ * descr is not used.
+ */
+void func_cpu_bis(void *descr[], void *_args)
+{
+	char msg;
+	char worker_name[256];
+	int worker_id = starpu_worker_get_id();
+	int worker_id_expected;
+	int ntasks;
+
+	(void)descr;
+
+	starpu_worker_get_name(worker_id, worker_name, sizeof(worker_name));
+	starpu_codelet_unpack_args(_args, &msg, &ntasks, &worker_id_expected);
+
+	STARPU_ASSERT(worker_id == worker_id_expected);
+
+	FPRINTF(stderr, "[msg '%c'] [worker id %d] [worker name %s] [tasks %d]\n", msg, worker_id, worker_name, ntasks);
+	if (ntasks > 0)
+	{
+		int nntasks = ntasks - 1;
+		/* The next task is expected on this very worker: its own id is
+		 * packed as the value the follow-up task will assert against. */
+		int ret = starpu_task_insert(&mycodelet_bis,
+					     STARPU_VALUE, &msg, sizeof(msg),
+					     STARPU_VALUE, &nntasks, sizeof(nntasks),
+					     STARPU_VALUE, &worker_id, sizeof(worker_id),
+					     0);
+		/* A silently failed submission would cut the chain short and
+		 * let the test pass without having checked anything. */
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+	}
+}
+
+/* Codelet of the follow-up tasks: only a CPU implementation is provided,
+ * func_cpu_bis(). All task arguments are passed through STARPU_VALUE. */
+struct starpu_codelet mycodelet_bis =
+{
+	.cpu_funcs = {func_cpu_bis},
+	.cpu_funcs_name = {"func_cpu_bis"},
+};
+
+/*
+ * CPU kernel of the first task submitted in each scheduling context.
+ *
+ * Unpacks (msg, ntasks, sched_ctx_id, worker_id_expected, sched_ctx_id_p)
+ * from _args and asserts that it runs on the expected worker. It then stores
+ * sched_ctx_id into the caller-provided buffer sched_ctx_id_p, hands that
+ * buffer to starpu_sched_ctx_set_context(), and starts the chain of
+ * mycodelet_bis tasks when ntasks is positive. descr is not used.
+ */
+void func_cpu(void *descr[], void *_args)
+{
+	char msg;
+	char worker_name[256];
+	int worker_id = starpu_worker_get_id();
+	int worker_id_expected;
+	int ntasks;
+	unsigned sched_ctx_id;
+	unsigned *sched_ctx_id_p;
+
+	(void)descr;
+
+	starpu_worker_get_name(worker_id, worker_name, sizeof(worker_name));
+	starpu_codelet_unpack_args(_args, &msg, &ntasks, &sched_ctx_id, &worker_id_expected, &sched_ctx_id_p);
+
+	STARPU_ASSERT(worker_id == worker_id_expected);
+
+	/* The id is copied into the buffer allocated by main() rather than
+	 * passing the address of the local variable.
+	 * NOTE(review): presumably StarPU keeps this pointer after the kernel
+	 * returns -- confirm against starpu_sched_ctx_set_context(). */
+	*sched_ctx_id_p = sched_ctx_id;
+	starpu_sched_ctx_set_context(sched_ctx_id_p);
+
+	/* %p requires a void * argument */
+	FPRINTF(stderr, "[msg '%c'] [worker id %d] [worker name %s] [sched_ctx_id %u] [tasks %d] [buffer %p]\n", msg, worker_id, worker_name, sched_ctx_id, ntasks, (void *)sched_ctx_id_p);
+	if (ntasks > 0)
+	{
+		int nntasks = ntasks - 1;
+		int ret = starpu_task_insert(&mycodelet_bis,
+					     STARPU_VALUE, &msg, sizeof(msg),
+					     STARPU_VALUE, &nntasks, sizeof(nntasks),
+					     STARPU_VALUE, &worker_id, sizeof(worker_id),
+					     0);
+		/* A silently failed submission would skip the whole chain of
+		 * follow-up tasks this test is meant to exercise. */
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+	}
+}
+
+/* Codelet of the initial task of each scheduling context: only a CPU
+ * implementation is provided, func_cpu(). */
+struct starpu_codelet mycodelet =
+{
+	.cpu_funcs = {func_cpu},
+	.cpu_funcs_name = {"func_cpu"},
+};
+
+/*
+ * Creates two scheduling contexts holding one CPU worker each, submits one
+ * mycodelet task into each of them and waits for all tasks to complete.
+ *
+ * Returns 0 on success, or STARPU_TEST_SKIPPED when starpu_init() reports
+ * -ENODEV or when fewer than two CPU workers are available.
+ */
+int main(int argc, char **argv)
+{
+	int ret;
+	int nprocs, nprocs_per_context=1;
+	int procs[STARPU_NMAXWORKERS];
+	int ntasks=10;
+	/* Two independent one-character messages, one per context; the array
+	 * is intentionally not NUL-terminated and never used as a string. */
+	char msg[2] = "ab";
+	unsigned *buffer[2];
+
+	(void)argc;
+	(void)argv;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	nprocs = starpu_cpu_worker_get_count();
+	if (nprocs < 2) goto enodev;
+
+	FPRINTF(stderr, "# Workers = %d -> %d worker for each sched context\n", nprocs, nprocs_per_context);
+	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs, nprocs);
+
+	/* ctx_0 gets the first nprocs_per_context CPU workers, ctx_1 the next ones */
+	unsigned sched_ctx_0 = starpu_sched_ctx_create(procs, nprocs_per_context, "ctx_0", 0);
+	unsigned sched_ctx_1 = starpu_sched_ctx_create(&procs[nprocs_per_context], nprocs_per_context, "ctx_1", 0);
+
+	/* Skip the whole diagnostic section when STARPU_SSILENT is set,
+	 * including the starpu_sched_ctx_display_workers() calls which are
+	 * given stderr directly. */
+	if (!getenv("STARPU_SSILENT"))
+	{
+		char name0[256];
+		char name1[256];
+
+		starpu_worker_get_name(procs[0], name0, sizeof(name0));
+		starpu_worker_get_name(procs[1], name1, sizeof(name1));
+
+		FPRINTF(stderr, "Creating first sched_ctx with %d worker [id %d name %s]\n", nprocs_per_context, procs[0], name0);
+		FPRINTF(stderr, "Creating second sched_ctx with %d worker [id %d name %s]\n", nprocs_per_context, procs[1], name1);
+
+		starpu_sched_ctx_display_workers(sched_ctx_0, stderr);
+		starpu_sched_ctx_display_workers(sched_ctx_1, stderr);
+	}
+
+	/* One heap cell per context: func_cpu() writes the context id into it
+	 * and passes it to starpu_sched_ctx_set_context(). */
+	buffer[0] = malloc(sizeof(*buffer[0]));
+	buffer[1] = malloc(sizeof(*buffer[1]));
+	/* func_cpu() dereferences these pointers unconditionally */
+	STARPU_ASSERT(buffer[0] != NULL && buffer[1] != NULL);
+	FPRINTF(stderr, "allocating %p and %p\n", (void *)buffer[0], (void *)buffer[1]);
+
+	/* Each task carries the id of the single worker of its context, which
+	 * func_cpu() asserts to be the worker actually running it. */
+	ret = starpu_task_insert(&mycodelet, STARPU_SCHED_CTX, sched_ctx_0,
+				 STARPU_VALUE, &msg[0], sizeof(msg[0]),
+				 STARPU_VALUE, &ntasks, sizeof(ntasks),
+				 STARPU_VALUE, &sched_ctx_0, sizeof(sched_ctx_0),
+				 STARPU_VALUE, &procs[0], sizeof(procs[0]),
+				 STARPU_VALUE, &buffer[0], sizeof(buffer[0]),
+				 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+	ret = starpu_task_insert(&mycodelet, STARPU_SCHED_CTX, sched_ctx_1,
+				 STARPU_VALUE, &msg[1], sizeof(msg[1]),
+				 STARPU_VALUE, &ntasks, sizeof(ntasks),
+				 STARPU_VALUE, &sched_ctx_1, sizeof(sched_ctx_1),
+				 STARPU_VALUE, &procs[1], sizeof(procs[1]),
+				 STARPU_VALUE, &buffer[1], sizeof(buffer[1]),
+				 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+
+	starpu_task_wait_for_all();
+	starpu_sched_ctx_delete(sched_ctx_0);
+	starpu_sched_ctx_delete(sched_ctx_1);
+	starpu_shutdown();
+	/* Released only after shutdown, once no task can reference them */
+	free(buffer[0]);
+	free(buffer[1]);
+	return 0;
+
+enodev:
+	starpu_shutdown();
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+	 * could perform the kernel, so this is not an error from StarPU */
+	return STARPU_TEST_SKIPPED;
+}

+ 4 - 4
tools/cppcheck/suppressions.txt

@@ -28,8 +28,8 @@ unusedPrivateFunction:tests/main/combined_workers/bfs/timer.h:45
 redundantAssignment:tests/main/driver_api/init_run_deinit.c
 redundantAssignment:tests/main/driver_api/run_driver.c
 
-uselessAssignmentPtrArg:mpi/src/starpu_mpi.c:155
-unreadVariable:mpi/src/starpu_mpi.c:849
+uselessAssignmentPtrArg:mpi/src/starpu_mpi.c:171
+unreadVariable:mpi/src/starpu_mpi.c:945
 redundantAssignment:src/core/workers.c
 
 invalidPointerCast:src/core/perfmodel/perfmodel_nan.c:74
@@ -42,20 +42,20 @@ unusedStructMember:src/core/perfmodel/perfmodel_bus.c:65
 unusedStructMember:src/core/perfmodel/perfmodel_bus.c:66
 unusedStructMember:src/core/simgrid.c:225
 unusedStructMember:src/core/simgrid.c:226
+wrongPrintfScanfArgNum:src/core/simgrid.c:719
 duplicateExpression:src/util/starpu_task_insert.c:52
 
 // TODO: this could be an error?
 redundantCopy:src/core/disk_ops/disk_leveldb.cpp:192
 
 nullPointerRedundantCheck:src/common/rbtree.c
-wrongPrintfScanfArgNum:src/core/simgrid.c:715
 unreadVariable:src/datawizard/interfaces/*
 unreadVariable:src/drivers/driver_common/driver_common.c:482
 clarifyCondition:src/drivers/opencl/driver_opencl.c:936
 unreadVariable:src/drivers/opencl/driver_opencl.c:767
 clarifyCondition:src/drivers/cuda/driver_cuda.c:506
 arithOperationsOnVoidPointer:src/drivers/scc/*
-nullPointerRedundantCheck:src/sched_policies/deque_modeling_policy_data_aware.c:197
+nullPointerRedundantCheck:src/sched_policies/deque_modeling_policy_data_aware.c:196
 sizeofDereferencedVoidPointer:src/util/fstarpu.c
 
 allocaCalled:gcc-plugin/src/*