Просмотр исходного кода

Add STARPU_PER_WORKER variant for perfmodels

Samuel Thibault лет назад: 5
Родитель
Сommit
ba99542d83

+ 1 - 0
ChangeLog

@@ -29,6 +29,7 @@ New features:
   * New number_events.data trace file which monitors number of events in trace
   * New number_events.data trace file which monitors number of events in trace
     files. This file can be parsed by the new script
     files. This file can be parsed by the new script
     starpu_fxt_number_events_to_names.py to convert event keys to event names.
     starpu_fxt_number_events_to_names.py to convert event keys to event names.
+  * New STARPU_PER_WORKER perfmodel.
 
 
 Small changes:
 Small changes:
   * Use the S4U interface of Simgrid instead of xbt and MSG.
   * Use the S4U interface of Simgrid instead of xbt and MSG.

+ 4 - 3
doc/doxygen/chapters/320_scheduling.doxy

@@ -190,9 +190,10 @@ single task gives the consumption of the task in Joules, which can be given to
 starpu_perfmodel_update_history().
 starpu_perfmodel_update_history().
 
 
 Another way to provide the energy performance is to define a
 Another way to provide the energy performance is to define a
-perfmodel with starpu_perfmodel::type ::STARPU_PER_ARCH, and set the
-starpu_perfmodel::arch_cost_function field to a function which shall return the
-estimated consumption of the task in Joules. Such a function can for instance
+perfmodel with starpu_perfmodel::type ::STARPU_PER_ARCH or
+::STARPU_PER_WORKER , and set the starpu_perfmodel::arch_cost_function or
+starpu_perfmodel::worker_cost_function field to a function which shall return
+the estimated consumption of the task in Joules. Such a function can for instance
 use starpu_task_expected_length() on the task (in µs), multiplied by the
 use starpu_task_expected_length() on the task (in µs), multiplied by the
 typical power consumption of the device, e.g. in W, and divided by 1000000. to
 typical power consumption of the device, e.g. in W, and divided by 1000000. to
 get Joules.
 get Joules.

+ 2 - 1
doc/doxygen/chapters/350_scheduling_policy_definition.doxy

@@ -45,7 +45,8 @@ provides a complete list of the functions available for writing advanced schedul
 This includes getting an estimation for a task computation completion with
 This includes getting an estimation for a task computation completion with
 starpu_task_expected_length(), for the required data transfers with
 starpu_task_expected_length(), for the required data transfers with
 starpu_task_expected_data_transfer_time_for(), for the required energy with
 starpu_task_expected_data_transfer_time_for(), for the required energy with
-starpu_task_expected_energy(), etc. Other
+starpu_task_expected_energy(), etc. Per-worker variants are also available with
+starpu_task_worker_expected_length(), etc. Other
 useful functions include starpu_transfer_bandwidth(), starpu_transfer_latency(),
 useful functions include starpu_transfer_bandwidth(), starpu_transfer_latency(),
 starpu_transfer_predict(), ...
 starpu_transfer_predict(), ...
 One can also directly test the presence of a data handle with starpu_data_is_on_node().
 One can also directly test the presence of a data handle with starpu_data_is_on_node().

+ 5 - 0
doc/doxygen/chapters/370_online_performance_tools.doxy

@@ -401,6 +401,11 @@ filled with pointers to functions which return the expected duration
 of the task in micro-seconds, one per architecture, see for instance
 of the task in micro-seconds, one per architecture, see for instance
 <c>tests/datawizard/locality.c</c>
 <c>tests/datawizard/locality.c</c>
 </li>
 </li>
+
+<li>
+Provided explicitly by the application (model type ::STARPU_PER_WORKER)
+similarly with the starpu_perfmodel::worker_cost_function field.
+</li>
 </ul>
 </ul>
 
 
 For ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED, and
 For ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED, and

+ 8 - 1
include/starpu_perfmodel.h

@@ -165,6 +165,7 @@ struct starpu_perfmodel_per_arch
 enum starpu_perfmodel_type
 enum starpu_perfmodel_type
 {
 {
         STARPU_PERFMODEL_INVALID=0,
         STARPU_PERFMODEL_INVALID=0,
+	STARPU_PER_WORKER,                /**< Application-provided per-worker cost model function */
 	STARPU_PER_ARCH,                  /**< Application-provided per-arch cost model function */
 	STARPU_PER_ARCH,                  /**< Application-provided per-arch cost model function */
 	STARPU_COMMON,                    /**< Application-provided common cost model function, with per-arch factor */
 	STARPU_COMMON,                    /**< Application-provided common cost model function, with per-arch factor */
 	STARPU_HISTORY_BASED,             /**< Automatic history-based cost model */
 	STARPU_HISTORY_BASED,             /**< Automatic history-based cost model */
@@ -226,11 +227,17 @@ struct starpu_perfmodel
 	*/
 	*/
 	double (*cost_function)(struct starpu_task *, unsigned nimpl);
 	double (*cost_function)(struct starpu_task *, unsigned nimpl);
 	/**
 	/**
-	   Used by ::STARPU_COMMON. Take a task, an arch and implementation
+	   Used by ::STARPU_PER_ARCH. Take a task, an arch and implementation
 	   number, and must return a task duration estimation in
 	   number, and must return a task duration estimation in
 	   micro-seconds on that arch.
 	   micro-seconds on that arch.
 	*/
 	*/
 	double (*arch_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch * arch, unsigned nimpl);
 	double (*arch_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch * arch, unsigned nimpl);
+	/**
+	   Used by ::STARPU_PER_WORKER. Take a task, a worker id and implementation
+	   number, and must return a task duration estimation in
+	   micro-seconds on that worker.
+	*/
+	double (*worker_cost_function)(struct starpu_task *, unsigned workerid, unsigned nimpl);
 
 
 	/**
 	/**
 	   Used by ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED and
 	   Used by ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED and

+ 14 - 0
include/starpu_scheduler.h

@@ -110,6 +110,10 @@ struct starpu_sched_policy
 	   to be executed by the worker. This method therefore permits
 	   to be executed by the worker. This method therefore permits
 	   to keep the state of the scheduler coherent even when
 	   to keep the state of the scheduler coherent even when
 	   StarPU bypasses the scheduling strategy.
 	   StarPU bypasses the scheduling strategy.
+
+	   Note: to get an estimation of the task duration, \p perf_workerid
+	   needs to be used rather than \p workerid, for the case of parallel
+	   tasks.
 	*/
 	*/
 	void (*push_task_notify)(struct starpu_task *, int workerid, int perf_workerid, unsigned sched_ctx_id);
 	void (*push_task_notify)(struct starpu_task *, int workerid, int perf_workerid, unsigned sched_ctx_id);
 
 
@@ -366,6 +370,11 @@ uint32_t starpu_task_data_footprint(struct starpu_task *task);
 double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
 double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
 
 
 /**
 /**
+   Same as starpu_task_expected_length() but for a precise worker.
+*/
+double starpu_task_worker_expected_length(struct starpu_task *task, unsigned workerid, unsigned sched_ctx_id, unsigned nimpl);
+
+/**
    Return an estimated speedup factor relative to CPU speed
    Return an estimated speedup factor relative to CPU speed
 */
 */
 double starpu_worker_get_relative_speedup(struct starpu_perfmodel_arch *perf_arch);
 double starpu_worker_get_relative_speedup(struct starpu_perfmodel_arch *perf_arch);
@@ -395,6 +404,11 @@ double starpu_data_expected_transfer_time(starpu_data_handle_t handle, unsigned
 double starpu_task_expected_energy(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
 double starpu_task_expected_energy(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
 
 
 /**
 /**
+   Same as starpu_task_expected_energy but for a precise worker
+*/
+double starpu_task_worker_expected_energy(struct starpu_task *task, unsigned workerid, unsigned sched_ctx_id, unsigned nimpl);
+
+/**
    Return expected conversion time in ms (multiformat interface only)
    Return expected conversion time in ms (multiformat interface only)
 */
 */
 double starpu_task_expected_conversion_time(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
 double starpu_task_expected_conversion_time(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);

+ 46 - 0
src/core/perfmodel/perfmodel.c

@@ -81,6 +81,20 @@ struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid, unsi
 }
 }
 
 
 /*
 /*
+ * PER WORKER model
+ */
+
+static double per_worker_task_expected_perf(struct starpu_perfmodel *model, unsigned workerid, struct starpu_task *task, unsigned nimpl)
+{
+	double (*worker_cost_function)(struct starpu_task *task, unsigned workerid, unsigned nimpl);
+
+	worker_cost_function = model->worker_cost_function;
+	STARPU_ASSERT_MSG(worker_cost_function, "STARPU_PER_WORKER needs worker_cost_function to be defined");
+
+	return worker_cost_function(task, workerid, nimpl);
+}
+
+/*
  * PER ARCH model
  * PER ARCH model
  */
  */
 
 
@@ -156,6 +170,7 @@ void _starpu_init_and_load_perfmodel(struct starpu_perfmodel *model)
 
 
 	switch (model->type)
 	switch (model->type)
 	{
 	{
+		case STARPU_PER_WORKER:
 		case STARPU_PER_ARCH:
 		case STARPU_PER_ARCH:
 		case STARPU_COMMON:
 		case STARPU_COMMON:
 			/* Nothing more to do than init */
 			/* Nothing more to do than init */
@@ -220,6 +235,20 @@ static double starpu_model_expected_perf(struct starpu_task *task, struct starpu
 	return exp_perf;
 	return exp_perf;
 }
 }
 
 
+static double starpu_model_worker_expected_perf(struct starpu_task *task, struct starpu_perfmodel *model, unsigned workerid, unsigned sched_ctx_id, unsigned nimpl)
+{
+	if (!model)
+		return 0.0;
+
+	if (model->type == STARPU_PER_WORKER)
+		return per_worker_task_expected_perf(model, workerid, task, nimpl);
+	else
+	{
+		struct starpu_perfmodel_arch *per_arch = starpu_worker_get_perf_archtype(workerid, sched_ctx_id);
+		return starpu_model_expected_perf(task, model, per_arch, nimpl);
+	}
+}
+
 double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 {
 	if (!task->cl)
 	if (!task->cl)
@@ -228,6 +257,14 @@ double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfm
 	return starpu_model_expected_perf(task, task->cl->model, arch, nimpl);
 	return starpu_model_expected_perf(task, task->cl->model, arch, nimpl);
 }
 }
 
 
+double starpu_task_worker_expected_length(struct starpu_task *task, unsigned workerid, unsigned sched_ctx_id, unsigned nimpl)
+{
+	if (!task->cl)
+		/* Tasks without codelet don't actually take time */
+		return 0.0;
+	return starpu_model_worker_expected_perf(task, task->cl->model, workerid, sched_ctx_id, nimpl);
+}
+
 double starpu_task_expected_energy(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 double starpu_task_expected_energy(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 {
 	if (!task->cl)
 	if (!task->cl)
@@ -236,6 +273,15 @@ double starpu_task_expected_energy(struct starpu_task *task, struct starpu_perfm
 	return starpu_model_expected_perf(task, task->cl->energy_model, arch, nimpl);
 	return starpu_model_expected_perf(task, task->cl->energy_model, arch, nimpl);
 }
 }
 
 
+double starpu_task_worker_expected_energy(struct starpu_task *task, unsigned workerid, unsigned sched_ctx_id, unsigned nimpl)
+{
+	if (!task->cl)
+		/* Tasks without codelet don't actually take time */
+		return 0.0;
+	return starpu_model_worker_expected_perf(task, task->cl->energy_model, workerid, sched_ctx_id, nimpl);
+
+}
+
 double starpu_task_expected_conversion_time(struct starpu_task *task,
 double starpu_task_expected_conversion_time(struct starpu_task *task,
 					    struct starpu_perfmodel_arch* arch,
 					    struct starpu_perfmodel_arch* arch,
 					    unsigned nimpl)
 					    unsigned nimpl)

+ 5 - 2
src/sched_policies/component_sched.c

@@ -49,7 +49,6 @@ int starpu_sched_component_execute_preds(struct starpu_sched_component * compone
 	    workerid != -1;
 	    workerid != -1;
 	    workerid = starpu_bitmap_next(component->workers_in_ctx, workerid))
 	    workerid = starpu_bitmap_next(component->workers_in_ctx, workerid))
 	{
 	{
-		struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid, component->tree->sched_ctx_id);
 		int nimpl;
 		int nimpl;
 		for(nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 		for(nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 		{
 		{
@@ -59,9 +58,13 @@ int starpu_sched_component_execute_preds(struct starpu_sched_component * compone
 				double d;
 				double d;
 				can_execute = 1;
 				can_execute = 1;
 				if(bundle)
 				if(bundle)
+				{
+					struct starpu_perfmodel_arch* archtype =
+						starpu_worker_get_perf_archtype(workerid, component->tree->sched_ctx_id);
 					d = starpu_task_bundle_expected_length(bundle, archtype, nimpl);
 					d = starpu_task_bundle_expected_length(bundle, archtype, nimpl);
+				}
 				else
 				else
-					d = starpu_task_expected_length(task, archtype, nimpl);
+					d = starpu_task_worker_expected_length(task, workerid, component->tree->sched_ctx_id, nimpl);
 				if(isnan(d))
 				if(isnan(d))
 				{
 				{
 					*length = d;
 					*length = d;

+ 5 - 6
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -487,7 +487,7 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 			}
 			}
 
 
 			double exp_end;
 			double exp_end;
-			double local_length = starpu_task_expected_length(task, perf_arch, nimpl);
+			double local_length = starpu_task_worker_expected_length(task, worker, sched_ctx_id, nimpl);
 			double local_penalty = starpu_task_expected_data_transfer_time_for(task, worker);
 			double local_penalty = starpu_task_expected_data_transfer_time_for(task, worker);
 			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
 			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
 
 
@@ -679,9 +679,9 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 			}
 			}
 			else
 			else
 			{
 			{
-				local_task_length[worker_ctx][nimpl] = starpu_task_expected_length(task, perf_arch, nimpl);
+				local_task_length[worker_ctx][nimpl] = starpu_task_worker_expected_length(task, workerid, sched_ctx_id, nimpl);
 				local_data_penalty[worker_ctx][nimpl] = starpu_task_expected_data_transfer_time_for(task, workerid);
 				local_data_penalty[worker_ctx][nimpl] = starpu_task_expected_data_transfer_time_for(task, workerid);
-				local_energy[worker_ctx][nimpl] = starpu_task_expected_energy(task, perf_arch,nimpl);
+				local_energy[worker_ctx][nimpl] = starpu_task_worker_expected_energy(task, workerid, sched_ctx_id,nimpl);
 				double conversion_time = starpu_task_expected_conversion_time(task, perf_arch, nimpl);
 				double conversion_time = starpu_task_expected_conversion_time(task, perf_arch, nimpl);
 				if (conversion_time > 0.0)
 				if (conversion_time > 0.0)
 					local_task_length[worker_ctx][nimpl] += conversion_time;
 					local_task_length[worker_ctx][nimpl] += conversion_time;
@@ -1100,10 +1100,9 @@ static void dmda_push_task_notify(struct starpu_task *task, int workerid, int pe
 {
 {
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
 	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
-	/* Compute the expected penality */
-	struct starpu_perfmodel_arch *perf_arch = starpu_worker_get_perf_archtype(perf_workerid, sched_ctx_id);
 
 
-	double predicted = starpu_task_expected_length(task, perf_arch,
+	/* Compute the expected penality */
+	double predicted = starpu_task_worker_expected_length(task, perf_workerid, STARPU_NMAX_SCHED_CTXS,
 						       starpu_task_get_implementation(task));
 						       starpu_task_get_implementation(task));
 
 
 	double predicted_transfer = starpu_task_expected_data_transfer_time_for(task, workerid);
 	double predicted_transfer = starpu_task_expected_data_transfer_time_for(task, workerid);