Corentin Salingue 12 lat temu
rodzic
commit
631853b887
37 zmienionych plików z 187 dodań i 115 usunięć
  1. 2 3
      sc_hypervisor/include/sc_hypervisor_policy.h
  2. 1 1
      sc_hypervisor/src/hypervisor_policies/debit_lp_policy.c
  3. 1 1
      sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c
  4. 1 3
      sc_hypervisor/src/hypervisor_policies/idle_policy.c
  5. 1 1
      sc_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c
  6. 1 1
      sc_hypervisor/src/hypervisor_policies/ispeed_policy.c
  7. 1 1
      sc_hypervisor/src/hypervisor_policies/teft_lp_policy.c
  8. 25 3
      sc_hypervisor/src/policies_utils/policy_tools.c
  9. 31 13
      sc_hypervisor/src/sc_hypervisor.c
  10. 15 0
      sc_hypervisor/src/sc_hypervisor_intern.h
  11. 1 1
      src/common/utils.c
  12. 3 3
      src/core/combined_workers.c
  13. 1 1
      src/core/dependencies/tags.c
  14. 1 1
      src/core/jobs.c
  15. 1 1
      src/core/perfmodel/perfmodel_history.c
  16. 2 2
      src/core/sched_ctx.c
  17. 1 1
      src/core/simgrid.c
  18. 3 3
      src/core/task.c
  19. 4 9
      src/core/topology.c
  20. 1 1
      src/core/workers.c
  21. 2 4
      src/core/workers.h
  22. 1 1
      src/datawizard/data_request.c
  23. 7 6
      src/datawizard/filters.c
  24. 5 5
      src/datawizard/interfaces/data_interface.c
  25. 1 1
      src/datawizard/malloc.c
  26. 58 32
      src/datawizard/memalloc.c
  27. 5 5
      src/datawizard/reduction.c
  28. 1 1
      src/datawizard/user_interactions.c
  29. 1 1
      src/debug/traces/starpu_fxt.c
  30. 1 1
      src/drivers/cpu/driver_cpu.c
  31. 1 1
      src/drivers/cuda/driver_cuda.c
  32. 1 1
      src/drivers/mic/driver_mic_source.c
  33. 1 1
      src/drivers/mp_common/source_common.c
  34. 1 1
      src/drivers/opencl/driver_opencl.c
  35. 2 2
      src/drivers/opencl/driver_opencl_utils.c
  36. 1 1
      src/drivers/scc/driver_scc_source.c
  37. 1 1
      src/sched_policies/deque_modeling_policy_data_aware.c

+ 2 - 3
sc_hypervisor/include/sc_hypervisor_policy.h

@@ -94,12 +94,11 @@ double sc_hypervisor_get_velocity_per_worker_type(struct sc_hypervisor_wrapper*
 /* compute the velocity of a type of worker in a context depending on its history */ 
 double sc_hypervisor_get_ref_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch);
 
-/* check if there are contexts a lot more delayed than others */
-int sc_hypervisor_has_velocity_gap_btw_ctxs(void);
-
 /* get the list of workers grouped by type */
 void sc_hypervisor_group_workers_by_type(int *workers, int nworkers, int ntypes_of_workers, int total_nw[ntypes_of_workers]);
 
+/* check if we trigger resizing or not */
+unsigned sc_hypervisor_criteria_fulfilled(unsigned sched_ctx, int worker);
 
 #ifdef __cplusplus
 }

+ 1 - 1
sc_hypervisor/src/hypervisor_policies/debit_lp_policy.c

@@ -232,7 +232,7 @@ static void debit_lp_handle_poped_task(unsigned sched_ctx, int worker, struct st
 	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
 	if(ret != EBUSY)
 	{
-		if(sc_hypervisor_has_velocity_gap_btw_ctxs())
+		if(sc_hypervisor_criteria_fulfilled(sched_ctx, worker))
 		{
 			int ns = sc_hypervisor_get_nsched_ctxs();
 			int nw = starpu_worker_get_count(); /* Number of different workers */

+ 1 - 1
sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c

@@ -21,7 +21,7 @@
 #ifdef STARPU_HAVE_GLPK_H
 static void feft_lp_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
 {
-	if(sc_hypervisor_has_velocity_gap_btw_ctxs())
+	if(sc_hypervisor_criteria_fulfilled(sched_ctx, worker))
 	{
 		int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
 

+ 1 - 3
sc_hypervisor/src/hypervisor_policies/idle_policy.c

@@ -30,9 +30,7 @@ unsigned worker_belong_to_other_sched_ctx(unsigned sched_ctx, int worker)
 
 void idle_handle_idle_cycle(unsigned sched_ctx, int worker)
 {
-	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx);
-	struct sc_hypervisor_policy_config *config = sc_w->config;
-	if(config != NULL &&  sc_w->current_idle_time[worker] > config->max_idle[worker])
+	if(sc_hypervisor_criteria_fulfilled(sched_ctx, worker))
 	{
 		if(worker_belong_to_other_sched_ctx(sched_ctx, worker))
 			sc_hypervisor_remove_workers_from_sched_ctx(&worker, 1, sched_ctx, 1);

+ 1 - 1
sc_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c

@@ -334,7 +334,7 @@ static void ispeed_lp_handle_poped_task(unsigned sched_ctx, int worker, struct s
 	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
 	if(ret != EBUSY)
 	{
-		if(sc_hypervisor_has_velocity_gap_btw_ctxs())
+		if(sc_hypervisor_criteria_fulfilled(sched_ctx, worker))
 		{
 			int ns = sc_hypervisor_get_nsched_ctxs();
 			int nw = starpu_worker_get_count(); /* Number of different workers */

+ 1 - 1
sc_hypervisor/src/hypervisor_policies/ispeed_policy.c

@@ -146,7 +146,7 @@ static void ispeed_handle_poped_task(unsigned sched_ctx, int worker, struct star
 	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
 	if(ret != EBUSY)
 	{
-		if(sc_hypervisor_has_velocity_gap_btw_ctxs())
+		if(sc_hypervisor_criteria_fulfilled(sched_ctx, worker))
 		{
 			unsigned fastest_sched_ctx = _get_fastest_sched_ctx();
 			unsigned slowest_sched_ctx = _get_slowest_sched_ctx();

+ 1 - 1
sc_hypervisor/src/hypervisor_policies/teft_lp_policy.c

@@ -168,7 +168,7 @@ static void teft_lp_handle_poped_task(unsigned sched_ctx, int worker, struct sta
 			return;
 		}
 
-		if(sc_hypervisor_has_velocity_gap_btw_ctxs())
+		if(sc_hypervisor_criteria_fulfilled(sched_ctx, worker))
 		{
 			int ns = sc_hypervisor_get_nsched_ctxs();
 			int nw = starpu_worker_get_count(); /* Number of different workers */

+ 25 - 3
sc_hypervisor/src/policies_utils/policy_tools.c

@@ -16,7 +16,7 @@
 
 
 #include "sc_hypervisor_policy.h"
-
+#include "sc_hypervisor_intern.h"
 static int _compute_priority(unsigned sched_ctx)
 {
 	struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sched_ctx);
@@ -526,7 +526,7 @@ double sc_hypervisor_get_velocity_per_worker_type(struct sc_hypervisor_wrapper*
 
 
 /* check if there is a big velocity gap between the contexts */
-int sc_hypervisor_has_velocity_gap_btw_ctxs()
+unsigned _check_velocity_gap_btw_ctxs()
 {
 	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
 	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
@@ -554,7 +554,7 @@ int sc_hypervisor_has_velocity_gap_btw_ctxs()
 					{
 						double gap = ctx_v < other_ctx_v ? other_ctx_v / ctx_v : ctx_v / other_ctx_v ;
 //						if(gap > 1.5)
-						if(gap > 3.0)
+						if(gap > _get_max_velocity_gap())
 							return 1;
 					}
 				}
@@ -628,3 +628,25 @@ void sc_hypervisor_get_tasks_times(int nw, int nt, double times[nw][nt], int *wo
         }
 }
 
+static unsigned _check_idle(unsigned sched_ctx, int worker)
+{
+	struct sc_hypervisor_wrapper *wrapper = sc_hypervisor_get_wrapper(sched_ctx);
+	struct sc_hypervisor_policy_config *cfg = wrapper->config;
+	if(cfg == NULL) /* without a config there is no idle threshold to compare against */
+		return 0;
+	return wrapper->current_idle_time[worker] > cfg->max_idle[worker]; /* 1 once the worker idled longer than allowed */
+}
+
+unsigned sc_hypervisor_criteria_fulfilled(unsigned sched_ctx, int worker)
+{
+	/* dispatch on the configured trigger: SC_NOTHING never resizes, SC_IDLE looks at this worker, anything else compares context velocities */
+	switch(_get_resize_criteria())
+	{
+	case SC_NOTHING:
+		return 0;
+	case SC_IDLE:
+		return _check_idle(sched_ctx, worker);
+	default:
+		return _check_velocity_gap_btw_ctxs();
+	}
+}

+ 31 - 13
sc_hypervisor/src/sc_hypervisor.c

@@ -133,6 +133,11 @@ struct starpu_sched_ctx_performance_counters* sc_hypervisor_init(struct sc_hyper
 {
 	hypervisor.min_tasks = 0;
 	hypervisor.nsched_ctxs = 0;
+	char* vel_gap = getenv("MAX_VELOCITY_GAP");
+	hypervisor.max_velocity_gap = vel_gap ? atof(vel_gap) : SC_VELOCITY_MAX_GAP_DEFAULT;
+	char* crit =  getenv("HYPERVISOR_TRIGGER_RESIZE"); /* NULL when the variable is unset */
+	hypervisor.resize_criteria = !crit ? SC_NOTHING : (strcmp(crit,"idle") == 0 ? SC_IDLE : (strcmp(crit,"speed") == 0 ? SC_SPEED : SC_NOTHING)); /* never hand NULL to strcmp: unset is treated like any unrecognised value */
+
 	starpu_pthread_mutex_init(&act_hypervisor_mutex, NULL);
 	hypervisor.start_executing_time = starpu_timing_now();
 	int i;
@@ -210,21 +215,24 @@ void sc_hypervisor_start_resize(unsigned sched_ctx)
 
 static void _print_current_time()
 {
-	double curr_time = starpu_timing_now();
-	double elapsed_time = (curr_time - hypervisor.start_executing_time) / 1000000.0; /* in seconds */
-	fprintf(stdout, "Time: %lf\n", elapsed_time);
-	int i;
-	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
+	if(!getenv("HYPERVISOR_STOP_PRINT"))
 	{
-		if(hypervisor.sched_ctxs[i] != STARPU_NMAX_SCHED_CTXS)
+		double curr_time = starpu_timing_now();
+		double elapsed_time = (curr_time - hypervisor.start_executing_time) / 1000000.0; /* in seconds */
+		fprintf(stdout, "Time: %lf\n", elapsed_time);
+		int i;
+		for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
 		{
-			struct sc_hypervisor_wrapper *sc_w = &hypervisor.sched_ctx_w[hypervisor.sched_ctxs[i]];
-
-			double cpu_speed = sc_hypervisor_get_velocity(sc_w, STARPU_CPU_WORKER);
-			double cuda_speed = sc_hypervisor_get_velocity(sc_w, STARPU_CUDA_WORKER);
-			int ncpus = sc_hypervisor_get_nworkers_ctx(sc_w->sched_ctx, STARPU_CPU_WORKER);
-			int ncuda = sc_hypervisor_get_nworkers_ctx(sc_w->sched_ctx, STARPU_CUDA_WORKER);
-			fprintf(stdout, "%d: cpu_v = %lf cuda_v = %lf ncpus = %d ncuda = %d\n", hypervisor.sched_ctxs[i], cpu_speed, cuda_speed, ncpus, ncuda);
+			if(hypervisor.sched_ctxs[i] != STARPU_NMAX_SCHED_CTXS)
+			{
+				struct sc_hypervisor_wrapper *sc_w = &hypervisor.sched_ctx_w[hypervisor.sched_ctxs[i]];
+				
+				double cpu_speed = sc_hypervisor_get_velocity(sc_w, STARPU_CPU_WORKER);
+				double cuda_speed = sc_hypervisor_get_velocity(sc_w, STARPU_CUDA_WORKER);
+				int ncpus = sc_hypervisor_get_nworkers_ctx(sc_w->sched_ctx, STARPU_CPU_WORKER);
+				int ncuda = sc_hypervisor_get_nworkers_ctx(sc_w->sched_ctx, STARPU_CUDA_WORKER);
+				fprintf(stdout, "%d: cpu_v = %lf cuda_v = %lf ncpus = %d ncuda = %d\n", hypervisor.sched_ctxs[i], cpu_speed, cuda_speed, ncpus, ncuda);
+			}
 		}
 	}
 	return;
@@ -364,6 +372,16 @@ static double _get_best_total_elapsed_flops(struct sc_hypervisor_wrapper* sc_w,
 	return ret_val;
 }
 
+double _get_max_velocity_gap(void)
+{
+	return hypervisor.max_velocity_gap; /* max velocity gap between contexts, as stored by sc_hypervisor_init */
+}
+
+unsigned _get_resize_criteria(void)
+{
+	return hypervisor.resize_criteria; /* one of SC_NOTHING / SC_IDLE / SC_SPEED */
+}
+
 /* compute an average value of the cpu/cuda velocity */
 double sc_hypervisorsc_hypervisor_get_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch)
 {

+ 15 - 0
sc_hypervisor/src/sc_hypervisor_intern.h

@@ -16,6 +16,11 @@
 
 #include <sc_hypervisor.h>
 #include <common/uthash.h>
+
+#define SC_VELOCITY_MAX_GAP_DEFAULT 50
+#define SC_NOTHING 0
+#define SC_IDLE 1
+#define SC_SPEED 2
 struct size_request
 {
 	int *workers;
@@ -74,6 +79,12 @@ struct sc_hypervisor
 
 	/* time when the hypervisor started */
 	double start_executing_time;
+
+	/* max velocity diff btw ctx before triggering resizing */
+	double max_velocity_gap;
+	
+	/* criteria to trigger resizing */
+	unsigned resize_criteria;
 };
 
 struct sc_hypervisor_adjustment
@@ -88,3 +99,7 @@ struct sc_hypervisor hypervisor;
 void _add_config(unsigned sched_ctx);
 
 void _remove_config(unsigned sched_ctx);
+
+double _get_max_velocity_gap(void); /* max velocity gap tolerated between contexts before resizing */
+
+unsigned _get_resize_criteria(void); /* criteria used to trigger resizing: SC_NOTHING, SC_IDLE or SC_SPEED */

+ 1 - 1
src/common/utils.c

@@ -134,7 +134,7 @@ char *_starpu_get_home_path(void)
 		static int warn;
 		if (!warn) {
 			warn = 1;
-			_STARPU_DISP("couldn't find a home place to put starpu data, using /tmp\n");
+			_STARPU_DISP("couldn't find a $STARPU_HOME place to put .starpu data, using /tmp\n");
 		}
 		path = "/tmp";
 	}

+ 3 - 3
src/core/combined_workers.c

@@ -125,12 +125,12 @@ int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[])
 #ifdef CPU_OR
 		CPU_OR(&combined_worker->cpu_set,
 			&combined_worker->cpu_set,
-			&config->workers[id].initial_cpu_set);
+			&config->workers[id].cpu_set);
 #else
 		int j;
 		for (j = 0; j < CPU_SETSIZE; j++)
 		{
-			if (CPU_ISSET(j, &config->workers[id].initial_cpu_set))
+			if (CPU_ISSET(j, &config->workers[id].cpu_set))
 				CPU_SET(j, &combined_worker->cpu_set);
 		}
 #endif
@@ -139,7 +139,7 @@ int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[])
 #ifdef STARPU_HAVE_HWLOC
 		hwloc_bitmap_or(combined_worker->hwloc_cpu_set,
 				combined_worker->hwloc_cpu_set,
-				config->workers[id].initial_hwloc_cpu_set);
+				config->workers[id].hwloc_cpu_set);
 #endif
 #endif
 	}

+ 1 - 1
src/core/dependencies/tags.c

@@ -288,7 +288,7 @@ void starpu_tag_restart(starpu_tag_t id)
 	struct _starpu_tag *tag = gettag_struct(id);
 
 	_starpu_spin_lock(&tag->lock);
-	STARPU_ASSERT_MSG(tag->state == STARPU_DONE, "Only completed tags can be restarted (was %d)", tag->state);
+	STARPU_ASSERT_MSG(tag->state == STARPU_DONE, "Only completed tags can be restarted (%llu was %d)", (unsigned long long) id, tag->state);
 	tag->state = STARPU_BLOCKED;
 	_starpu_spin_unlock(&tag->lock);
 }

+ 1 - 1
src/core/jobs.c

@@ -313,7 +313,7 @@ static unsigned _starpu_not_all_tag_deps_are_fulfilled(struct _starpu_job *j)
 	struct _starpu_cg_list *tag_successors = &tag->tag_successors;
 
 	_starpu_spin_lock(&tag->lock);
-	STARPU_ASSERT_MSG(tag->is_assigned == 1 || !tag_successors->ndeps, "a tag can be assigned only one task to wake");
+	STARPU_ASSERT_MSG(tag->is_assigned == 1 || !tag_successors->ndeps, "a tag can be assigned only one task to wake (%llu had %u assigned tasks, and %u successors)", (unsigned long long) tag->id, tag->is_assigned, tag_successors->ndeps);
 
 	if (tag_successors->ndeps != tag_successors->ndeps_completed)
 	{

+ 1 - 1
src/core/perfmodel/perfmodel_history.c

@@ -963,7 +963,7 @@ int starpu_perfmodel_list(FILE *output)
         }
         else
 	{
-		_STARPU_DISP("Could not open the perfmodel directory <%s>\n", path);
+		_STARPU_DISP("Could not open the perfmodel directory <%s>: %s\n", path, strerror(errno));
         }
 	return 0;
 }

+ 2 - 2
src/core/sched_ctx.c

@@ -234,7 +234,7 @@ static void _starpu_sched_ctx_create_hwloc_tree(struct _starpu_sched_ctx *sched_
 		{
 			hwloc_bitmap_or(sched_ctx->hwloc_workers_set,
 					sched_ctx->hwloc_workers_set,
-					config->workers[worker].initial_hwloc_cpu_set);
+					config->workers[worker].hwloc_cpu_set);
 		}
 
 	}
@@ -578,7 +578,7 @@ static void _starpu_check_workers(int *workerids, int nworkers)
 	for(i = 0; i < nworkers; i++)
 	{
 		/* take care the user does not ask for a resource that does not exist */
-		STARPU_ASSERT_MSG(workerids[i] >= 0 &&  workerids[i] <= nworkers_conf, "workerid = %d", workerids[i]);
+		STARPU_ASSERT_MSG(workerids[i] >= 0 &&  workerids[i] <= nworkers_conf, "requested to add workerid = %d, but that is beyond the range 0 to %d", workerids[i], nworkers_conf);
 	}
 }
 

+ 1 - 1
src/core/simgrid.c

@@ -111,7 +111,7 @@ void _starpu_simgrid_execute_job(struct _starpu_job *j, enum starpu_perfmodel_ar
 	{
 		length = starpu_task_expected_length(task, perf_arch, j->nimpl);
 		STARPU_ASSERT_MSG(!_STARPU_IS_ZERO(length) && !isnan(length),
-			"Codelet %s does not have a perfmodel, or is not calibrated enough",
+				"Codelet %s does not have a perfmodel, or is not calibrated enough, please re-run in non-simgrid mode until it is calibrated",
 			_starpu_job_get_model_name(j));
 	}
 

+ 3 - 3
src/core/task.c

@@ -363,7 +363,7 @@ void _starpu_task_check_deprecated_fields(struct starpu_task *task)
 	if (task->cl)
 	{
 		unsigned i;
-		for(i=0; i<task->cl->nbuffers ; i++)
+		for(i=0; i<STARPU_MIN(task->cl->nbuffers, STARPU_NMAXBUFS) ; i++)
 		{
 			if (task->buffers[i].handle && task->handles[i])
 			{
@@ -427,7 +427,7 @@ int starpu_task_submit(struct starpu_task *task)
 
 		/* Check buffers */
 		if (task->dyn_handles == NULL)
-			STARPU_ASSERT_MSG(task->cl->nbuffers <= STARPU_NMAXBUFS, "Codelet %p has too many buffers (%d vs max %d)", task->cl, task->cl->nbuffers, STARPU_NMAXBUFS);
+			STARPU_ASSERT_MSG(task->cl->nbuffers <= STARPU_NMAXBUFS, "Codelet %p has too many buffers (%d vs max %d). Either use --enable-maxbuffers configure option to increase the max, or use dyn_handles instead of handles.", task->cl, task->cl->nbuffers, STARPU_NMAXBUFS);
 
 		if (task->dyn_handles)
 		{
@@ -438,7 +438,7 @@ int starpu_task_submit(struct starpu_task *task)
 		{
 			starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
 			/* Make sure handles are not partitioned */
-			STARPU_ASSERT_MSG(handle->nchildren == 0, "only unpartitioned data can be used in a task");
+			STARPU_ASSERT_MSG(handle->nchildren == 0, "only unpartitioned data (or the pieces of a partitioned data) can be used in a task");
 			/* Provide the home interface for now if any,
 			 * for can_execute hooks */
 			if (handle->home_node != -1)

+ 4 - 9
src/core/topology.c

@@ -1278,10 +1278,8 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
 #ifdef __GLIBC__
 		/* Save the initial cpuset */
-		CPU_ZERO(&workerarg->initial_cpu_set);
-		CPU_SET(workerarg->bindid, &workerarg->initial_cpu_set);
-		CPU_ZERO(&workerarg->current_cpu_set);
-		CPU_SET(workerarg->bindid, &workerarg->current_cpu_set);
+		CPU_ZERO(&workerarg->cpu_set);
+		CPU_SET(workerarg->bindid, &workerarg->cpu_set);
 #endif /* __GLIBC__ */
 
 #ifdef STARPU_HAVE_HWLOC
@@ -1295,9 +1293,7 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 		worker_obj->userdata = &config->workers[worker];
 
 		/* Clear the cpu set and set the cpu */
-		workerarg->initial_hwloc_cpu_set =
-			hwloc_bitmap_dup (worker_obj->cpuset);
-		workerarg->current_hwloc_cpu_set =
+		workerarg->hwloc_cpu_set =
 			hwloc_bitmap_dup (worker_obj->cpuset);
 #endif
 	}
@@ -1340,8 +1336,7 @@ _starpu_destroy_topology (
 	{
 #ifdef STARPU_HAVE_HWLOC
 		struct _starpu_worker *workerarg = &config->workers[worker];
-		hwloc_bitmap_free(workerarg->initial_hwloc_cpu_set);
-		hwloc_bitmap_free(workerarg->current_hwloc_cpu_set);
+		hwloc_bitmap_free(workerarg->hwloc_cpu_set);
 #endif
 	}
 

+ 1 - 1
src/core/workers.c

@@ -396,7 +396,7 @@ void _starpu_worker_init(struct _starpu_worker *worker, unsigned fut_key)
 
         _STARPU_DEBUG("worker %d is ready on logical cpu %d\n", devid, worker->bindid);
 #ifdef STARPU_HAVE_HWLOC
-	_STARPU_DEBUG("worker %d cpuset start at %d\n", devid, hwloc_bitmap_first(worker->initial_hwloc_cpu_set));
+	_STARPU_DEBUG("worker %d cpuset start at %d\n", devid, hwloc_bitmap_first(worker->hwloc_cpu_set));
 #endif
 
 	_starpu_memory_node_set_local_key(&worker->memory_node);

+ 2 - 4
src/core/workers.h

@@ -106,12 +106,10 @@ struct _starpu_worker
 	unsigned parallel_sect;
 
 #ifdef __GLIBC__
-	cpu_set_t initial_cpu_set;
-	cpu_set_t current_cpu_set;
+	cpu_set_t cpu_set;
 #endif /* __GLIBC__ */
 #ifdef STARPU_HAVE_HWLOC
-	hwloc_bitmap_t initial_hwloc_cpu_set;
-	hwloc_bitmap_t current_hwloc_cpu_set;
+	hwloc_bitmap_t hwloc_cpu_set;
 #endif
 };
 

+ 1 - 1
src/datawizard/data_request.c

@@ -162,7 +162,7 @@ int _starpu_wait_data_request_completion(struct _starpu_data_request *r, unsigne
 
 	retval = r->retval;
 	if (retval)
-		_STARPU_DISP("REQUEST %p COMPLETED (retval %d) !\n", r, r->retval);
+		_STARPU_DISP("REQUEST %p completed with retval %d!\n", r, r->retval);
 
 
 	r->refcnt--;

+ 7 - 6
src/datawizard/filters.c

@@ -75,7 +75,8 @@ int starpu_data_get_nb_children(starpu_data_handle_t handle)
 
 starpu_data_handle_t starpu_data_get_child(starpu_data_handle_t handle, unsigned i)
 {
-	STARPU_ASSERT_MSG(i < handle->nchildren, "Invalid child index %u, maximum %u", i, handle->nchildren);
+	STARPU_ASSERT_MSG(handle->nchildren != 0, "Data %p has to be partitioned before accessing children", handle);
+	STARPU_ASSERT_MSG(i < handle->nchildren, "Invalid child index %u in handle %p, maximum %u", i, handle, handle->nchildren);
 	return &handle->children[i];
 }
 
@@ -104,8 +105,8 @@ starpu_data_handle_t starpu_data_vget_sub_data(starpu_data_handle_t root_handle,
 		unsigned next_child;
 		next_child = va_arg(pa, unsigned);
 
-		STARPU_ASSERT_MSG(current_handle->nchildren != 0, "Data has to be partitioned before accessing children");
-		STARPU_ASSERT_MSG(next_child < current_handle->nchildren, "Bogus child number");
+		STARPU_ASSERT_MSG(current_handle->nchildren != 0, "Data %p has to be partitioned before accessing children", current_handle);
+		STARPU_ASSERT_MSG(next_child < current_handle->nchildren, "Bogus child number %u, data %p only has %u children", next_child, current_handle, current_handle->nchildren);
 
 		current_handle = &current_handle->children[next_child];
 	}
@@ -122,7 +123,7 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 	/* first take care to properly lock the data header */
 	_starpu_spin_lock(&initial_handle->header_lock);
 
-	STARPU_ASSERT_MSG(initial_handle->nchildren == 0, "there should not be mutiple filters applied on the same data");
+	STARPU_ASSERT_MSG(initial_handle->nchildren == 0, "there should not be mutiple filters applied on the same data %p, futher filtering has to be done on children", initial_handle);
 
 	/* how many parts ? */
 	if (f->get_nchildren)
@@ -130,7 +131,7 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 	else
 	  nparts = f->nchildren;
 
-	STARPU_ASSERT_MSG(nparts > 0, "Partitioning in 0 piece does not make sense");
+	STARPU_ASSERT_MSG(nparts > 0, "Partitioning data %p in 0 piece does not make sense", initial_handle);
 
 	/* allocate the children */
 	starpu_data_create_children(initial_handle, nparts, f);
@@ -277,7 +278,7 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 
 	_starpu_spin_lock(&root_handle->header_lock);
 
-	STARPU_ASSERT_MSG(root_handle->nchildren != 0, "data is not partitioned");
+	STARPU_ASSERT_MSG(root_handle->nchildren != 0, "data %p is not partitioned, can not unpartition it", root_handle);
 
 	/* first take all the children lock (in order !) */
 	for (child = 0; child < root_handle->nchildren; child++)

+ 5 - 5
src/datawizard/interfaces/data_interface.c

@@ -421,7 +421,7 @@ int starpu_data_set_tag(starpu_data_handle_t handle, int tag)
 	entry = (struct handle_tag_entry *) malloc(sizeof(*entry));
 	STARPU_ASSERT(entry != NULL);
 
-	STARPU_ASSERT_MSG(!(starpu_data_get_data_handle_from_tag(tag)),"A data handle with tag %d had already been registered.\n",tag);
+	STARPU_ASSERT_MSG(!(starpu_data_get_data_handle_from_tag(tag)),"data handle %p already has tag %d\n", starpu_data_get_data_handle_from_tag(tag), tag);
 
 	entry->tag = tag;
 	entry->handle = handle;
@@ -442,7 +442,7 @@ int starpu_data_release_tag(starpu_data_handle_t handle)
 	{
 		_starpu_spin_lock(&registered_tag_handles_lock);
 		HASH_FIND_INT(registered_tag_handles, &handle->tag, tag_entry);
-		STARPU_ASSERT_MSG((tag_entry != NULL),"Handle %p with tag %d isn't in the hashmap !",handle,handle->tag);
+		STARPU_ASSERT_MSG((tag_entry != NULL),"Data handle %p with tag %d isn't in the hashmap !",handle,handle->tag);
 
 		HASH_DEL(registered_tag_handles, tag_entry);
 		free(tag_entry);
@@ -559,7 +559,7 @@ static void _starpu_data_unregister_fetch_data_callback(void *_arg)
 static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned coherent)
 {
 	STARPU_ASSERT(handle);
-	STARPU_ASSERT_MSG(handle->nchildren == 0, "data needs to be unpartitioned before unregistration");
+	STARPU_ASSERT_MSG(handle->nchildren == 0, "data %p needs to be unpartitioned before unregistration", handle);
 
 	if (coherent)
 	{
@@ -736,7 +736,7 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 
 void starpu_data_unregister(starpu_data_handle_t handle)
 {
-	STARPU_ASSERT_MSG(!handle->lazy_unregister, "data must not be unregistered twice");
+	STARPU_ASSERT_MSG(!handle->lazy_unregister, "data %p can not be unregistered twice", handle);
 	_starpu_data_unregister(handle, 1);
 }
 
@@ -748,7 +748,7 @@ void starpu_data_unregister_no_coherency(starpu_data_handle_t handle)
 void starpu_data_unregister_submit(starpu_data_handle_t handle)
 {
 	_starpu_spin_lock(&handle->header_lock);
-	STARPU_ASSERT_MSG(!handle->lazy_unregister, "data must not be unregistered twice");
+	STARPU_ASSERT_MSG(!handle->lazy_unregister, "data %p can not be unregistered twice", handle);
 	handle->lazy_unregister = 1;
 	_starpu_spin_unlock(&handle->header_lock);
 	_starpu_data_unregister(handle, 0);

+ 1 - 1
src/datawizard/malloc.c

@@ -27,7 +27,7 @@ static size_t _malloc_align = sizeof(void*);
 
 void starpu_malloc_set_align(size_t align)
 {
-	STARPU_ASSERT_MSG(!(align & (align - 1)), "Alignment given to starpu_malloc_set_align must be a power of two");
+	STARPU_ASSERT_MSG(!(align & (align - 1)), "Alignment given to starpu_malloc_set_align (%lu) must be a power of two", (unsigned long) align);
 	if (_malloc_align < align)
 		_malloc_align = align;
 }

+ 58 - 32
src/datawizard/memalloc.c

@@ -135,6 +135,8 @@ static void transfer_subtree_to_node(starpu_data_handle_t handle, unsigned src_n
 	unsigned cnt;
 	int ret;
 
+	STARPU_ASSERT(dst_node != src_node);
+
 	if (handle->nchildren == 0)
 	{
 		struct _starpu_data_replicate *src_replicate = &handle->per_node[src_node];
@@ -210,6 +212,23 @@ static void transfer_subtree_to_node(starpu_data_handle_t handle, unsigned src_n
 	}
 }
 
+static void notify_handle_children(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned node)
+{
+	unsigned idx = 0;
+
+	/* XXX why do we need to clear automatically_allocated as well ? */
+	replicate->automatically_allocated = 0;
+
+	replicate->allocated = 0;
+
+	/* Recurse so that every descendant's replicate on this node is marked deallocated too */
+	while (idx < handle->nchildren)
+	{
+		starpu_data_handle_t sub_handle = starpu_data_get_child(handle, idx++);
+		notify_handle_children(sub_handle, &sub_handle->per_node[node], node);
+	}
+}
+
 static size_t free_memory_on_node(struct _starpu_mem_chunk *mc, unsigned node)
 {
 	size_t freed = 0;
@@ -244,12 +263,7 @@ static size_t free_memory_on_node(struct _starpu_mem_chunk *mc, unsigned node)
 		mc->ops->free_data_on_node(mc->chunk_interface, node);
 
 		if (handle)
-		{
-			replicate->allocated = 0;
-
-			/* XXX why do we need that ? */
-			replicate->automatically_allocated = 0;
-		}
+			notify_handle_children(handle, replicate, node);
 
 		freed = mc->size;
 
@@ -298,6 +312,10 @@ static size_t try_to_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
 	if (handle->wt_mask & (1<<node))
 		return 0;
 
+	/* This data was registered from this node, we will not be able to drop it anyway */
+	if ((int) node == handle->home_node)
+		return 0;
+
 	/* REDUX memchunk */
 	if (mc->relaxed_coherency == 2)
 	{
@@ -332,26 +350,35 @@ static size_t try_to_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
 		/* check if they are all "free" */
 		if (may_free_subtree(handle, node))
 		{
-			STARPU_ASSERT(handle->per_node[node].refcnt == 0);
+			int target = -1;
 
-#ifdef STARPU_MEMORY_STATS
-			if (handle->per_node[node].state == STARPU_OWNER)
-				_starpu_memory_handle_stats_invalidated(handle, node);
-			/* else XXX Considering only owner to invalidate */
-#endif
+			/* XXX Considering only owner to invalidate */
+
+			STARPU_ASSERT(handle->per_node[node].refcnt == 0);
 
 			/* in case there was nobody using that buffer, throw it
-			 * away after writing it back to main memory if we can*/
+
+
+
+
+
+			 * away after writing it back to main memory */
+			if (handle->home_node != -1)
+				target = handle->home_node;
+			else
+				/* NULL-registered data, push to RAM if it's not what we are flushing */
+				if (node != 0)
+					target = 0;
+
+
+
+
 
 			size_t size_handle = _starpu_data_get_size(handle);
 
 			if (_starpu_memory_manager_test_allocate_size_(size_handle, STARPU_MAIN_RAM) == 1)
 			{
-				transfer_subtree_to_node(handle, node, STARPU_MAIN_RAM);
-
-#ifdef STARPU_MEMORY_STATS
-				_starpu_memory_handle_stats_loaded_owner(handle, STARPU_MAIN_RAM);
-#endif
+				target = STARPU_MAIN_RAM;
 			}
 			else
 			{	
@@ -360,7 +387,6 @@ static size_t try_to_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
 				unsigned nnodes = starpu_memory_nodes_get_count();
 				unsigned int i;
 				double time_disk = 0;
-				unsigned disk = 0;
 				
 				for (i = 0; i < nnodes; i++)
 				{
@@ -369,28 +395,28 @@ static size_t try_to_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
 						/* only time can change between disk <-> main_ram 
 						 * and not between main_ram <-> worker if we compare diks*/
 						double time_tmp = _starpu_predict_transfer_time(i, STARPU_MAIN_RAM, size_handle);
-						if (disk == 0 || time_disk > time_tmp)
+						if (target == -1 || time_disk > time_tmp)
 						{
-							disk = i;
+							target = i;
 							time_disk = time_tmp;
 						}	
 					}
 				}
+			}      
 
-				STARPU_ASSERT_MSG(disk != 0, "MEMORY FULL");
-
-				/* transfer */
-				transfer_subtree_to_node(handle, node, disk);
 
+			if (target != -1)
+			{
+				transfer_subtree_to_node(handle, node, target);
 #ifdef STARPU_MEMORY_STATS
-				_starpu_memory_handle_stats_loaded_owner(handle, disk);
-#endif				
-				
-			}      
-			STARPU_ASSERT(handle->per_node[node].refcnt == 0);
+				_starpu_memory_handle_stats_loaded_owner(handle, target);
+#endif
 
-			/* now the actual buffer may be freed */
-			freed = do_free_mem_chunk(mc, node);
+				STARPU_ASSERT(handle->per_node[node].refcnt == 0);
+
+				/* now the actual buffer may be freed */
+				freed = do_free_mem_chunk(mc, node);
+			}
 		}
 
 		/* unlock the leafs */

+ 5 - 5
src/datawizard/reduction.c

@@ -250,8 +250,8 @@ void _starpu_data_end_reduction_mode(starpu_data_handle_t handle)
 					if (!(STARPU_CODELET_GET_MODE(redux_task->cl, 1)))
 						STARPU_CODELET_SET_MODE(redux_task->cl, STARPU_R, 1);
 
-					STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(redux_task->cl, 0) == STARPU_RW, "First parameter of reduction codelet has to be RW");
-					STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(redux_task->cl, 1) == STARPU_R, "Second parameter of reduction codelet has to be R");
+					STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(redux_task->cl, 0) == STARPU_RW, "First parameter of reduction codelet %p has to be RW", redux_task->cl);
+					STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(redux_task->cl, 1) == STARPU_R, "Second parameter of reduction codelet %p has to be R", redux_task->cl);
 
 					STARPU_TASK_SET_HANDLE(redux_task, replicate_array[i], 0);
 					STARPU_TASK_SET_HANDLE(redux_task, replicate_array[i+step], 1);
@@ -309,7 +309,7 @@ void _starpu_data_end_reduction_mode(starpu_data_handle_t handle)
 
 			if (!(STARPU_CODELET_GET_MODE(redux_task->cl, 0)))
 				STARPU_CODELET_SET_MODE(redux_task->cl, STARPU_W, 0);
-			STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(redux_task->cl, 0) == STARPU_W, "Parameter of initialization codelet has to be W");
+			STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(redux_task->cl, 0) == STARPU_W, "Parameter of initialization codelet %p has to be W", redux_task->cl);
 
 			STARPU_TASK_SET_HANDLE(redux_task, handle, 0);
 
@@ -338,8 +338,8 @@ void _starpu_data_end_reduction_mode(starpu_data_handle_t handle)
 			if (!(STARPU_CODELET_GET_MODE(redux_task->cl, 1))
 				STARPU_CODELET_SET_MODE(redux_task->cl, STARPU_R, 1);
 
-			STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(redux_task->cl, 0) == STARPU_RW, "First parameter of reduction codelet has to be RW");
-			STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(redux_task->cl, 1) == STARPU_R, "Second parameter of reduction codelet has to be R");
+			STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(redux_task->cl, 0) == STARPU_RW, "First parameter of reduction codelet %p has to be RW", redux_task->cl);
+			STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(redux_task->cl, 1) == STARPU_R, "Second parameter of reduction codelet %p has to be R", redux_task->cl);
 
 			STARPU_TASK_SET_HANDLE(redux_task, handle, 0);
 			STARPU_TASK_SET_HANDLE(redux_task, replicate_array[replicate], 1);

+ 1 - 1
src/datawizard/user_interactions.c

@@ -118,7 +118,7 @@ int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, unsigned node,
 			   enum starpu_data_access_mode mode, void (*callback)(void *), void *arg)
 {
 	STARPU_ASSERT(handle);
-	STARPU_ASSERT_MSG(handle->nchildren == 0, "Acquiring a partitioned data is not possible");
+	STARPU_ASSERT_MSG(handle->nchildren == 0, "Acquiring a partitioned data (%p) is not possible", handle);
         _STARPU_LOG_IN();
 
 	struct user_interaction_wrapper *wrapper = (struct user_interaction_wrapper *) malloc(sizeof(struct user_interaction_wrapper));

+ 1 - 1
src/debug/traces/starpu_fxt.c

@@ -158,7 +158,7 @@ static void register_worker_id(unsigned long tid, int workerid)
 
 	HASH_FIND(hh, worker_ids, &tid, sizeof(tid), entry);
 
-	STARPU_ASSERT_MSG(workerid < STARPU_NMAXWORKERS, "Too many workers in this trace, please increase the maximum number of CPUs and GPUs to the same value as was used for execution");
+	STARPU_ASSERT_MSG(workerid < STARPU_NMAXWORKERS, "Too many workers in this trace, please increase in ./configure invocation the maximum number of CPUs and GPUs to the same value as was used for execution");
 
 	/* only register a thread once */
 	STARPU_ASSERT(entry == NULL);

+ 1 - 1
src/drivers/cpu/driver_cpu.c

@@ -68,7 +68,7 @@ _starpu_cpu_discover_devices(struct _starpu_machine_config *config)
 
 	if (config->cpu_depth == HWLOC_TYPE_DEPTH_UNKNOWN) {
 		/* unknown, using logical procesors as fallback */
-		_STARPU_DISP("Warning: OS did not report CPU cores. Assuming there is only one thread per core.\n");
+		_STARPU_DISP("Warning: The OS did not report CPU cores. Assuming there is only one hardware thread per core.\n");
 		config->cpu_depth = hwloc_get_type_depth(topology->hwtopology,
 							 HWLOC_OBJ_PU);
 	}

+ 1 - 1
src/drivers/cuda/driver_cuda.c

@@ -481,7 +481,7 @@ int _starpu_cuda_driver_run_once(struct starpu_driver *d)
 		switch (res)
 		{
 			case -EAGAIN:
-				_STARPU_DISP("ouch, put the codelet %p back ... \n", j);
+				_STARPU_DISP("ouch, CUDA could not actually run task %p, putting it back...\n", task);
 				_starpu_push_task_to_workers(task);
 				STARPU_ABORT();
 			default:

+ 1 - 1
src/drivers/mic/driver_mic_source.c

@@ -685,7 +685,7 @@ void *_starpu_mic_src_worker(void *arg)
 			switch (res)
 			{
 				case -EAGAIN:
-					_STARPU_DISP("ouch, put the codelet %p back ... \n", j);
+					_STARPU_DISP("ouch, Xeon Phi could not actually run task %p, putting it back...\n", task);
 					_starpu_push_task_to_workers(task);
 					STARPU_ABORT();
 					continue;

+ 1 - 1
src/drivers/mp_common/source_common.c

@@ -65,7 +65,7 @@ int _starpu_src_common_lookup(struct _starpu_mp_node *node,
 						&arg_size);
 
 	if (answer == STARPU_ERROR_LOOKUP) {
-		_STARPU_DISP("Error looking up %s\n", func_name);
+		_STARPU_DISP("Error looking up symbol %s\n", func_name);
 		return -ESPIPE;
 	}
 

+ 1 - 1
src/drivers/opencl/driver_opencl.c

@@ -677,7 +677,7 @@ int _starpu_opencl_driver_run_once(struct starpu_driver *d)
 		switch (res)
 		{
 			case -EAGAIN:
-				_STARPU_DISP("ouch, put the codelet %p back ... \n", j);
+				_STARPU_DISP("ouch, OpenCL could not actually run task %p, putting it back...\n", task);
 				_starpu_push_task_to_workers(task);
 				STARPU_ABORT();
 				return 0;

+ 2 - 2
src/drivers/opencl/driver_opencl_utils.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -271,7 +271,7 @@ int _starpu_opencl_compile_or_load_opencl_from_string(const char *opencl_program
 		// Create the compute program from the source buffer
 		program = clCreateProgramWithSource(context, 1, (const char **) &opencl_program_source, NULL, &err);
 		if (!program || err != CL_SUCCESS) {
-			_STARPU_DISP("Error: Failed to load program source!\n");
+			_STARPU_DISP("Error: Failed to load program source with options %s!\n", build_options);
 			return EXIT_FAILURE;
 		}
 

+ 1 - 1
src/drivers/scc/driver_scc_source.c

@@ -387,7 +387,7 @@ void *_starpu_scc_src_worker(void *arg)
 			switch (res)
 			{
 				case -EAGAIN:
-					_STARPU_DISP("ouch, put the codelet %p back ... \n", j);
+					_STARPU_DISP("ouch, SCC could not actually run task %p, putting it back...\n", task);
 					_starpu_push_task_to_workers(task);
 					STARPU_ABORT();
 					continue;

+ 1 - 1
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -212,7 +212,7 @@ static struct starpu_task *dmda_pop_task(unsigned sched_ctx_id)
 	int workerid = starpu_worker_get_id();
 	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
 
-	STARPU_ASSERT_MSG(fifo, "worker %d does not belong to ctx %d anymore \n", workerid, sched_ctx_id);
+	STARPU_ASSERT_MSG(fifo, "worker %d does not belong to ctx %d anymore.\n", workerid, sched_ctx_id);
 
 	task = _starpu_fifo_pop_local_task(fifo);
 	if (task)