Kaynağa Gözat

* Add STARPU_TASK_BREAK_ON_SCHED, STARPU_TASK_BREAK_ON_PUSH, and STARPU_TASK_BREAK_ON_POP environment variables to debug schedulers.

Samuel Thibault 9 yıl önce
ebeveyn
işleme
7771735456

+ 2 - 0
ChangeLog

@@ -199,6 +199,8 @@ Small features:
     starpu_mpi_task_insert() functions call
   * Add starpu_fxt_autostart_profiling to be able to avoid autostart.
   * Add arch_cost_function perfmodel function field.
+  * Add STARPU_TASK_BREAK_ON_SCHED, STARPU_TASK_BREAK_ON_PUSH, and
+  STARPU_TASK_BREAK_ON_POP environment variables to debug schedulers.
 
 Changes:
   * Data interfaces (variable, vector, matrix and block) now define

+ 16 - 0
doc/doxygen/chapters/08scheduling.doxy

@@ -251,4 +251,20 @@ heuristic based on the duration of the task over CPUs and GPUs to decide between
 the two queues. CPU workers can then pop from the CPU priority queue, and GPU
 workers from the GPU priority queue.
 
+\section DebuggingScheduling Debugging scheduling
+
+All the \ref OnlinePerformanceTools and \ref OfflinePerformanceTools can
+be used to get information about how well the execution proceeded, and thus the
+overall quality of the execution.
+
+Precise debugging can also be performed by using the \ref
+STARPU_TASK_BREAK_ON_SCHED, \ref STARPU_TASK_BREAK_ON_PUSH, and \ref
+STARPU_TASK_BREAK_ON_POP environment variables. By setting the job_id of a task
+in these environment variables, StarPU will raise SIGTRAP when the task is being
+scheduled, pushed, or popped by the scheduler. That means that when one notices
+that a task is being scheduled in a seemingly odd way, one can just reexecute
+the application in a debugger, with some of those variables set, and the
+execution will stop exactly at the scheduling points of that task, thus allowing
+one to inspect the scheduler state, etc.
+
 */

+ 29 - 0
doc/doxygen/chapters/40environment_variables.doxy

@@ -794,6 +794,35 @@ dog is reached, thus allowing to catch the situation in gdb, etc
 (see \ref DetectionStuckConditions)
 </dd>
 
+<dt>STARPU_TASK_BREAK_ON_SCHED</dt>
+<dd>
+\anchor STARPU_TASK_BREAK_ON_SCHED
+\addindex __env__STARPU_TASK_BREAK_ON_SCHED
+When this variable contains a job id, StarPU will raise SIGTRAP when the task
+with that job id is being scheduled by the scheduler (at a scheduler-specific
+point), which will be nicely caught by debuggers.
+This only works for schedulers which have such a scheduling point defined.
+See \ref DebuggingScheduling
+</dd>
+
+<dt>STARPU_TASK_BREAK_ON_PUSH</dt>
+<dd>
+\anchor STARPU_TASK_BREAK_ON_PUSH
+\addindex __env__STARPU_TASK_BREAK_ON_PUSH
+When this variable contains a job id, StarPU will raise SIGTRAP when the task
+with that job id is being pushed to the scheduler, which will be nicely caught by debuggers.
+See \ref DebuggingScheduling
+</dd>
+
+<dt>STARPU_TASK_BREAK_ON_POP</dt>
+<dd>
+\anchor STARPU_TASK_BREAK_ON_POP
+\addindex __env__STARPU_TASK_BREAK_ON_POP
+When this variable contains a job id, StarPU will raise SIGTRAP when the task
+with that job id is being popped from the scheduler, which will be nicely caught by debuggers.
+See \ref DebuggingScheduling
+</dd>
+
 <dt>STARPU_DISABLE_KERNELS</dt>
 <dd>
 \anchor STARPU_DISABLE_KERNELS

+ 2 - 1
src/core/jobs.c

@@ -87,7 +87,8 @@ struct _starpu_job* STARPU_ATTRIBUTE_MALLOC _starpu_job_create(struct starpu_tas
 	job->task = task;
 
 #ifndef STARPU_USE_FXT
-	if (_starpu_bound_recording || _starpu_top_status_get()
+	if (_starpu_bound_recording || _starpu_top_status_get() ||
+		_starpu_task_break_on_push != -1 || _starpu_task_break_on_pop != -1 || _starpu_task_break_on_sched != -1
 #ifdef HAVE_AYUDAME_H
 		|| AYU_event
 #endif

+ 16 - 0
src/core/sched_policy.c

@@ -28,6 +28,17 @@ static int use_prefetch = 0;
 static double idle[STARPU_NMAXWORKERS];
 static double idle_start[STARPU_NMAXWORKERS];
 
+long _starpu_task_break_on_push = -1; /* job id to break on when the task is pushed; -1 means disabled */
+long _starpu_task_break_on_pop = -1; /* job id to break on when the task is popped; -1 means disabled */
+long _starpu_task_break_on_sched = -1; /* job id to break on when the task is scheduled; -1 means disabled */
+/* Load the STARPU_TASK_BREAK_ON_* job ids from the environment. An unset variable must keep the -1 "disabled" value, since _starpu_job_create() tests these globals against -1. */
+void _starpu_sched_init(void)
+{
+	_starpu_task_break_on_push = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_PUSH", -1);
+	_starpu_task_break_on_pop = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_POP", -1);
+	_starpu_task_break_on_sched = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_SCHED", -1);
+}
+
 int starpu_get_prefetch_flag(void)
 {
 	return use_prefetch;
@@ -579,6 +590,7 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 				ret = -1;
 			else
 			{
+				_STARPU_TASK_BREAK_ON(task, push);
 				_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 				ret = sched_ctx->sched_policy->push_task(task);
 				_STARPU_TRACE_WORKER_SCHEDULING_POP;
@@ -785,6 +797,8 @@ pick:
 	/* perhaps there is some local task to be executed first */
 	task = _starpu_pop_local_task(worker);
 
+	if (task)
+		_STARPU_TASK_BREAK_ON(task, pop);
 
 	/* get tasks from the stacks of the strategy */
 	if(!task)
@@ -835,6 +849,8 @@ pick:
 					 * pushing/popping a scheduling state here, while what we
 					 * want to see in the trace is a permanent idle state. */
 					task = sched_ctx->sched_policy->pop_task(sched_ctx->id);
+					if (task)
+						_STARPU_TASK_BREAK_ON(task, pop);
 					_starpu_pop_task_end(task);
 				}
 			}

+ 17 - 0
src/core/sched_policy.h

@@ -19,10 +19,13 @@
 #define __SCHED_POLICY_H__
 
 #include <starpu.h>
+#include <signal.h>
 #include <core/workers.h>
 #include <core/sched_ctx.h>
 #include <starpu_scheduler.h>
 
+void _starpu_sched_init(void);
+
 struct starpu_machine_config;
 struct starpu_sched_policy *_starpu_get_sched_policy( struct _starpu_sched_ctx *sched_ctx);
 
@@ -88,4 +91,18 @@ extern struct starpu_sched_policy _starpu_sched_modular_ws_policy;
 extern struct starpu_sched_policy _starpu_sched_modular_heft_policy;
 extern struct starpu_sched_policy _starpu_sched_modular_heft2_policy;
 extern struct starpu_sched_policy _starpu_sched_graph_test_policy;
+
+extern long _starpu_task_break_on_push; /* job id matched by _STARPU_TASK_BREAK_ON(task, push) */
+extern long _starpu_task_break_on_pop; /* job id matched by _STARPU_TASK_BREAK_ON(task, pop) */
+extern long _starpu_task_break_on_sched; /* job id matched by _STARPU_TASK_BREAK_ON(task, sched) */
+
+#ifdef SIGTRAP /* the breakpoint is only available where SIGTRAP is defined */
+#define _STARPU_TASK_BREAK_ON(task, what) do { /* what selects the variable: push, pop or sched */ \
+	if (_starpu_get_job_associated_to_task(task)->job_id == (unsigned long) _starpu_task_break_on_##what) \
+		raise(SIGTRAP); /* the task's job id matches the requested one: trap so a debugger can catch it */ \
+} while(0)
+#else
+#define _STARPU_TASK_BREAK_ON(task, what) ((void) 0) /* no SIGTRAP: the macro expands to a no-op */
+#endif
+
 #endif // __SCHED_POLICY_H__

+ 1 - 0
src/core/workers.c

@@ -1181,6 +1181,7 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 		_starpu_config.conf.not_launched_drivers = copy;
 	}
 
+	_starpu_sched_init();
 	_starpu_job_init();
 	_starpu_graph_init();
 

+ 3 - 1
src/sched_policies/component_heft.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2013-2015  Université de Bordeaux
+ * Copyright (C) 2013-2016  Université de Bordeaux
  * Copyright (C) 2013  INRIA
  * Copyright (C) 2013  Simon Archipoff
  *
@@ -26,6 +26,7 @@
 #include <starpu_perfmodel.h>
 #include "helper_mct.h"
 #include <float.h>
+#include <core/sched_policy.h>
 
 #define NTASKS 5
 
@@ -153,6 +154,7 @@ static int heft_progress_one(struct starpu_sched_component *component)
 			return 1;
 		}
 
+		_STARPU_TASK_BREAK_ON(tasks[best_task], sched);
 		int ret = starpu_sched_component_push_task(component, best_component, tasks[best_task]);
 
 		if (ret)

+ 3 - 1
src/sched_policies/component_mct.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2013-2015  Université de Bordeaux
+ * Copyright (C) 2013-2016  Université de Bordeaux
  * Copyright (C) 2013  INRIA
  * Copyright (C) 2013  Simon Archipoff
  *
@@ -21,6 +21,7 @@
 #include <starpu_perfmodel.h>
 #include "helper_mct.h"
 #include <float.h>
+#include <core/sched_policy.h>
 
 static int mct_push_task(struct starpu_sched_component * component, struct starpu_task * task)
 {
@@ -99,6 +100,7 @@ static int mct_push_task(struct starpu_sched_component * component, struct starp
 		return 1;
 	}
 
+	_STARPU_TASK_BREAK_ON(task, sched);
 	int ret = starpu_sched_component_push_task(component, best_component, task);
 	return ret;
 }

+ 2 - 0
src/sched_policies/component_random.c

@@ -17,6 +17,7 @@
 
 #include <starpu_sched_component.h>
 #include <core/workers.h>
+#include <core/sched_policy.h>
 
 static double compute_relative_speedup(struct starpu_sched_component * component)
 {
@@ -90,6 +91,7 @@ static int random_push_task(struct starpu_sched_component * component, struct st
 		return 1;
 	}
 
+	_STARPU_TASK_BREAK_ON(task, sched);
 	int ret_val = starpu_sched_component_push_task(component,select,task);
 	return ret_val;
 }

+ 5 - 0
src/sched_policies/component_work_stealing.c

@@ -22,6 +22,7 @@
 #include <starpu_scheduler.h>
 #include <starpu.h>
 #include <core/workers.h>
+#include <core/sched_policy.h>
 
 #include "prio_deque.h"
 
@@ -63,7 +64,10 @@ static struct starpu_task *  steal_task_round_robin(struct starpu_sched_componen
 		}
 		STARPU_PTHREAD_MUTEX_UNLOCK(wsd->mutexes[i]);
 		if(task)
+		{
+			_STARPU_TASK_BREAK_ON(task, sched);
 			break;
+		}
 
 		if (i == wsd->last_pop_child)
 		{
@@ -227,6 +231,7 @@ static int push_task(struct starpu_sched_component * component, struct starpu_ta
 	i = (i+1)%component->nchildren;
 
 	STARPU_PTHREAD_MUTEX_LOCK(wsd->mutexes[i]);
+	_STARPU_TASK_BREAK_ON(task, sched);
 	ret = _starpu_prio_deque_push_task(wsd->fifos[i], task);
 	STARPU_PTHREAD_MUTEX_UNLOCK(wsd->mutexes[i]);
 

+ 3 - 0
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -25,6 +25,7 @@
 #include <common/fxt.h>
 #include <core/task.h>
 #include <core/workers.h>
+#include <core/sched_policy.h>
 
 #include <sched_policies/fifo_queues.h>
 #include <limits.h>
@@ -587,6 +588,7 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 
 	starpu_task_set_implementation(task, best_impl);
 
+	_STARPU_TASK_BREAK_ON(task, sched);
 	/* we should now have the best worker in variable "best" */
 	return push_task_on_best_worker(task, best,
 					model_best, transfer_model_best, prio, sched_ctx_id);
@@ -891,6 +893,7 @@ static double _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned
 	//_STARPU_DEBUG("Scheduler dmda: kernel (%u)\n", best_impl);
 	starpu_task_set_implementation(task, selected_impl);
 
+	_STARPU_TASK_BREAK_ON(task, sched);
 	if(!simulate)
 	{
 		/* we should now have the best worker in variable "best" */

+ 2 - 0
src/sched_policies/parallel_heft.c

@@ -25,6 +25,7 @@
 #include <core/perfmodel/perfmodel.h>
 #include <starpu_parameters.h>
 #include <core/detect_combined_workers.h>
+#include <core/sched_policy.h>
 
 #ifndef DBL_MIN
 #define DBL_MIN __DBL_MIN__
@@ -481,6 +482,7 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, uns
 	//_STARPU_DEBUG("Scheduler parallel heft: kernel (%u)\n", nimpl_best);
 	_starpu_get_job_associated_to_task(task)->nimpl = nimpl_best;
 	/* we should now have the best worker in variable "best" */
+	_STARPU_TASK_BREAK_ON(task, sched);
 	return push_task_on_best_worker(task, best, best_exp_end, prio, sched_ctx_id);
 }
 

+ 3 - 1
src/sched_policies/random_policy.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2014  Université de Bordeaux
+ * Copyright (C) 2010-2014, 2016  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -20,6 +20,7 @@
 #include <starpu_rand.h>
 #include <core/workers.h>
 #include <core/sched_ctx.h>
+#include <core/sched_policy.h>
 #include <sched_policies/fifo_queues.h>
 #ifdef HAVE_AYUDAME_H
 #include <Ayudame.h>
@@ -89,6 +90,7 @@ static int _random_push_task(struct starpu_task *task, unsigned prio)
 	}
 #endif
 
+	_STARPU_TASK_BREAK_ON(task, sched);
 	return starpu_push_local_task(selected, task, prio);
 }
 

+ 3 - 0
src/sched_policies/work_stealing_policy.c

@@ -24,6 +24,7 @@
 #include <sched_policies/fifo_queues.h>
 #include <core/debug.h>
 #include <starpu_scheduler.h>
+#include <core/sched_policy.h>
 
 #ifdef HAVE_AYUDAME_H
 #include <Ayudame.h>
@@ -552,6 +553,7 @@ static struct starpu_task *ws_pop_task(unsigned sched_ctx_id)
 	if (task)
 	{
 		_STARPU_TRACE_WORK_STEALING(workerid, victim);
+		_STARPU_TASK_BREAK_ON(task, sched);
 		record_data_locality(task, workerid);
 		record_worker_locality(task, workerid, sched_ctx_id);
 		locality_popped_task(task, victim, sched_ctx_id);
@@ -611,6 +613,7 @@ int ws_push_task(struct starpu_task *task)
 #endif
 
 	STARPU_PTHREAD_MUTEX_LOCK(&ws->per_worker[workerid].worker_mutex);
+	_STARPU_TASK_BREAK_ON(task, sched);
 	record_data_locality(task, workerid);
 	_starpu_fifo_push_task(ws->per_worker[workerid].queue_array, task);
 	locality_pushed_task(task, workerid, sched_ctx_id);