浏览代码

Permit the application to provide its own size base for performance models

Samuel Thibault 13 年之前
父节点
当前提交
37862069c7

+ 9 - 1
doc/chapters/advanced-examples.texi

@@ -346,7 +346,7 @@ struct starpu_codelet cl = @{
 @end cartouche
 
 @item
-Measured at runtime and refined by regression (@code{STARPU_REGRESSION_*_BASED}
+Measured at runtime and refined by regression (@code{STARPU_*REGRESSION_BASED}
 model type). This still assumes performance regularity, but can work
 with various data input sizes, by applying regression over observed
 execution times. STARPU_REGRESSION_BASED uses an a*n^b regression
@@ -374,6 +374,14 @@ per architecture.
 
 @end itemize
 
+For the @code{STARPU_HISTORY_BASED} and @code{STARPU_*REGRESSION_BASED} model types,
+the total size of task data (both input and output) is used as an index by
+default. The @code{size_base} field of @code{struct starpu_perfmodel} however
+permits the application to override that, when for instance some of the data
+do not matter for task cost (e.g. mere reference table), or when using sparse
+structures (in which case it is the number of non-zeros that matters), or when
+there is some hidden parameter such as the number of iterations, etc.
+
 How to use schedulers which can benefit from such performance model is explained
 in @ref{Task scheduling policy}.
 

+ 5 - 0
doc/chapters/basic-api.texi

@@ -1460,6 +1460,9 @@ used as file name to store the model.
 implementation number, and must return a task duration estimation in micro-seconds.
 @item @code{per_arch}: Used by @code{STARPU_PER_ARCH}: array of @code{struct
 starpu_per_arch_perfmodel} structures.
+@item @code{size_base}: Used by @code{STARPU_HISTORY_BASED} and
+@code{STARPU_*REGRESSION_BASED}. If not NULL, takes a task and implementation
+number, and returns the size to be used as index for history and regression.
 @end table
 @end deftp
 
@@ -1479,6 +1482,8 @@ records all execution history measures.
 Used by @code{STARPU_HISTORY_REGRESION_BASED} and
 @code{STARPU_NL_REGRESSION_BASED}, contains the estimated factors of the
 regression.
+@item @code{size_base}: Same as in @code{struct starpu_perfmodel}, but per-arch, in
+case it depends on the architecture-specific implementation.
 @end table
 @end deftp
 

+ 7 - 0
examples/opt/pi/pi.c

@@ -61,6 +61,12 @@ static void cpu_kernel(void *descr[], void *cl_arg)
 	free(random_numbers);
 }
 
+/* The amount of work does not depend on the data size at all :) */
+static size_t size_base(struct starpu_task *task, unsigned nimpl)
+{
+	return NSHOT_PER_TASK;
+}
+
 static void parse_args(int argc, char **argv)
 {
 	int i;
@@ -114,6 +120,7 @@ int main(int argc, char **argv)
 	static struct starpu_perfmodel model =
 	{
 		.type = STARPU_HISTORY_BASED,
+		.size_base = size_base,
 		.symbol = "monte_carlo_pi"
 	};
 

+ 13 - 0
examples/opt/pi/pi_redux.c

@@ -189,6 +189,19 @@ static void pi_func_cuda(void *descr[], void *cl_arg __attribute__ ((unused)))
 }
 #endif
 
+/* The amount of work does not depend on the data size at all :) */
+static size_t size_base(struct starpu_task *task, unsigned nimpl)
+{
+	return NSHOT_PER_TASK;
+}
+
+static struct starpu_perfmodel model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.size_base = size_base,
+	.symbol = "monte_carlo_pi_redux"
+};
+
 static struct starpu_codelet pi_cl =
 {
 	.where =

+ 3 - 0
include/starpu_perfmodel.h

@@ -135,6 +135,7 @@ struct starpu_per_arch_perfmodel
 {
 	double (*cost_model)(struct starpu_buffer_descr *t) STARPU_DEPRECATED; /* returns expected duration in µs */
 	double (*cost_function)(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl); /* returns expected duration in µs */
+	size_t (*size_base)(struct starpu_task *, enum starpu_perf_archtype arch, unsigned nimpl);
 
 	/* internal variables */
 	struct starpu_htbl32_node *history;
@@ -163,6 +164,8 @@ struct starpu_perfmodel
 	double (*cost_model)(struct starpu_buffer_descr *) STARPU_DEPRECATED;
 	double (*cost_function)(struct starpu_task *, unsigned nimpl);
 
+	size_t (*size_base)(struct starpu_task *, unsigned nimpl);
+
 	/* per-architecture model */
 	struct starpu_per_arch_perfmodel per_arch[STARPU_NARCH_VARIATIONS][STARPU_MAXIMPLEMENTATIONS];
 

+ 4 - 4
src/common/fxt.h

@@ -199,11 +199,11 @@ do {									\
 	}								\
 } while(0);
 
-#define _STARPU_TRACE_END_CODELET_BODY(job, archtype)			\
+#define _STARPU_TRACE_END_CODELET_BODY(job, perf_arch, nimpl, archtype)			\
 do {									\
-	const size_t job_size = _starpu_job_get_data_size((job));	\
-	const uint32_t job_hash = _starpu_compute_buffers_footprint(job);\
-	FUT_DO_PROBE5(_STARPU_FUT_END_CODELET_BODY, job, (job_size), (job_hash), (archtype), syscall(SYS_gettid));	\
+	const size_t job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));	\
+	const uint32_t job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));\
+	FUT_DO_PROBE5(_STARPU_FUT_END_CODELET_BODY, (job), (job_size), (job_hash), (archtype), syscall(SYS_gettid));	\
 } while(0);
 
 #define _STARPU_TRACE_START_CALLBACK(job)	\

+ 16 - 12
src/core/jobs.c

@@ -27,22 +27,26 @@
 #include <profiling/bound.h>
 #include <starpu_top.h>
 
-size_t _starpu_job_get_data_size(struct _starpu_job *j)
+size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned nimpl, struct _starpu_job *j)
 {
-	size_t size = 0;
-
 	struct starpu_task *task = j->task;
 
-	unsigned nbuffers = task->cl->nbuffers;
-
-	unsigned buffer;
-	for (buffer = 0; buffer < nbuffers; buffer++)
-	{
-		starpu_data_handle_t handle = task->handles[buffer];
-		size += _starpu_data_get_size(handle);
+	if (model && model->per_arch[arch][nimpl].size_base) {
+		return model->per_arch[arch][nimpl].size_base(task, arch, nimpl);
+	} else if (model && model->size_base) {
+		return model->size_base(task, nimpl);
+	} else {
+		unsigned nbuffers = task->cl->nbuffers;
+		size_t size = 0;
+
+		unsigned buffer;
+		for (buffer = 0; buffer < nbuffers; buffer++)
+		{
+			starpu_data_handle_t handle = task->handles[buffer];
+			size += _starpu_data_get_size(handle);
+		}
+		return size;
 	}
-
-	return size;
 }
 
 /* we need to identify each task to generate the DAG. */

+ 2 - 2
src/core/jobs.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
@@ -151,7 +151,7 @@ unsigned _starpu_enforce_deps_starting_from_task(struct _starpu_job *j, unsigned
 void _starpu_handle_job_termination(struct _starpu_job *j, unsigned job_is_already_locked);
 
 /* Get the sum of the size of the data accessed by the job. */
-size_t _starpu_job_get_data_size(struct _starpu_job *j);
+size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned nimpl, struct _starpu_job *j);
 
 /* Get a task from the local pool of tasks that were explicitly attributed to
  * that worker. */

+ 9 - 9
src/core/perfmodel/perfmodel_history.c

@@ -893,7 +893,7 @@ void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model,
 double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, struct _starpu_job *j, unsigned nimpl)
 {
 	double exp = -1.0;
-	size_t size = _starpu_job_get_data_size(j);
+	size_t size = _starpu_job_get_data_size(model, arch, nimpl, j);
 	struct starpu_regression_model *regmodel;
 
 	regmodel = &model->per_arch[arch][nimpl].regression;
@@ -907,7 +907,7 @@ double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model
 double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, struct _starpu_job *j,unsigned nimpl)
 {
 	double exp = -1.0;
-	size_t size = _starpu_job_get_data_size(j);
+	size_t size = _starpu_job_get_data_size(model, arch, nimpl, j);
 	struct starpu_regression_model *regmodel;
 
 	regmodel = &model->per_arch[arch][nimpl].regression;
@@ -916,7 +916,7 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 		exp = regmodel->a*pow((double)size, regmodel->b) + regmodel->c;
 	else
 	{
-		uint32_t key = _starpu_compute_buffers_footprint(j);
+		uint32_t key = _starpu_compute_buffers_footprint(model, arch, nimpl, j);
 		struct starpu_per_arch_perfmodel *per_arch_model = &model->per_arch[arch][nimpl];
 		struct starpu_htbl32_node *history = per_arch_model->history;
 		struct starpu_history_entry *entry;
@@ -945,7 +945,7 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, e
 	struct starpu_history_entry *entry;
 	struct starpu_htbl32_node *history;
 
-	uint32_t key = _starpu_compute_buffers_footprint(j);
+	uint32_t key = _starpu_compute_buffers_footprint(model, arch, nimpl, j);
 
 	per_arch_model = &model->per_arch[arch][nimpl];
 
@@ -989,7 +989,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 			struct starpu_htbl32_node *history;
 			struct starpu_htbl32_node **history_ptr;
 			struct starpu_history_list **list;
-			uint32_t key = _starpu_compute_buffers_footprint(j);
+			uint32_t key = _starpu_compute_buffers_footprint(model, arch, nimpl, j);
 
 			history = per_arch_model->history;
 			history_ptr = &per_arch_model->history;
@@ -1008,7 +1008,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 					entry->deviation = 0.0;
 					entry->sum2 = measured*measured;
 
-					entry->size = _starpu_job_get_data_size(j);
+					entry->size = _starpu_job_get_data_size(model, arch, nimpl, j);
 
 					entry->footprint = key;
 					entry->nsample = 1;
@@ -1037,7 +1037,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 			reg_model = &per_arch_model->regression;
 
 			/* update the regression model */
-			size_t job_size = _starpu_job_get_data_size(j);
+			size_t job_size = _starpu_job_get_data_size(model, arch, nimpl, j);
 			double logy, logx;
 			logx = log((double)job_size);
 			logy = log(measured);
@@ -1069,11 +1069,11 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 		FILE * debug_file = per_arch_model->debug_file;
 
 		if (!j->footprint_is_computed)
-			(void) _starpu_compute_buffers_footprint(j);
+			(void) _starpu_compute_buffers_footprint(model, arch, nimpl, j);
 
 		STARPU_ASSERT(j->footprint_is_computed);
 
-		fprintf(debug_file, "0x%x\t%lu\t%f\t%f\t%f\t%d\t\t", j->footprint, (unsigned long) _starpu_job_get_data_size(j), measured, task->predicted, task->predicted_transfer, cpuid);
+		fprintf(debug_file, "0x%x\t%lu\t%f\t%f\t%f\t%d\t\t", j->footprint, (unsigned long) _starpu_job_get_data_size(model, arch, nimpl, j), measured, task->predicted, task->predicted_transfer, cpuid);
 		unsigned i;
 
 		for (i = 0; i < task->cl->nbuffers; i++)

+ 17 - 9
src/datawizard/footprint.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -18,7 +18,7 @@
 #include <datawizard/footprint.h>
 #include <common/hash.h>
 
-uint32_t _starpu_compute_buffers_footprint(struct _starpu_job *j)
+uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned nimpl, struct _starpu_job *j)
 {
 	if (j->footprint_is_computed)
 		return j->footprint;
@@ -28,13 +28,21 @@ uint32_t _starpu_compute_buffers_footprint(struct _starpu_job *j)
 
 	struct starpu_task *task = j->task;
 
-	for (buffer = 0; buffer < task->cl->nbuffers; buffer++)
-	{
-		starpu_data_handle_t handle = task->handles[buffer];
-
-		uint32_t handle_footprint = _starpu_data_get_footprint(handle);
-
-		footprint = _starpu_crc32_be(handle_footprint, footprint);
+	if (model && model->per_arch[arch][nimpl].size_base) {
+		size_t size = model->per_arch[arch][nimpl].size_base(task, arch, nimpl);
+		footprint = _starpu_crc32_be_n(&size, sizeof(size), footprint);
+	} else if (model && model->size_base) {
+		size_t size = model->size_base(task, nimpl);
+		footprint = _starpu_crc32_be_n(&size, sizeof(size), footprint);
+	} else {
+		for (buffer = 0; buffer < task->cl->nbuffers; buffer++)
+		{
+			starpu_data_handle_t handle = task->handles[buffer];
+
+			uint32_t handle_footprint = _starpu_data_get_footprint(handle);
+
+			footprint = _starpu_crc32_be(handle_footprint, footprint);
+		}
 	}
 
 	j->footprint = footprint;

+ 2 - 2
src/datawizard/footprint.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -24,7 +24,7 @@
 
 /* Compute the footprint that characterizes the job and cache it into the job
  * structure. */
-uint32_t _starpu_compute_buffers_footprint(struct _starpu_job *j);
+uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned nimpl, struct _starpu_job *j);
 
 /* Compute the footprint that characterizes the layout of the data handle. */
 uint32_t _starpu_compute_data_footprint(starpu_data_handle_t handle);

+ 1 - 1
src/drivers/cpu/driver_cpu.c

@@ -60,7 +60,7 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct _starpu_worker *cpu_
 		func(task->interfaces, task->cl_arg);
 	}
 
-	_starpu_driver_end_job(cpu_args, j, &codelet_end, rank);
+	_starpu_driver_end_job(cpu_args, j, perf_arch, &codelet_end, rank);
 
 	if (is_parallel_task)
 		_STARPU_PTHREAD_BARRIER_WAIT(&j->after_work_barrier);

+ 1 - 1
src/drivers/cuda/driver_cuda.c

@@ -221,7 +221,7 @@ static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 	STARPU_ASSERT(func);
 	func(task->interfaces, task->cl_arg);
 
-	_starpu_driver_end_job(args, j, &codelet_end, 0);
+	_starpu_driver_end_job(args, j, args->perf_arch, &codelet_end, 0);
 
 	_starpu_driver_update_job_feedback(j, args, args->perf_arch, &codelet_start, &codelet_end);
 

+ 2 - 2
src/drivers/driver_common/driver_common.c

@@ -60,7 +60,7 @@ void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j
 	_STARPU_TRACE_START_CODELET_BODY(j);
 }
 
-void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, struct timespec *codelet_end, int rank)
+void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, enum starpu_perf_archtype perf_arch, struct timespec *codelet_end, int rank)
 {
 	struct starpu_task *task = j->task;
 	struct starpu_codelet *cl = task->cl;
@@ -71,7 +71,7 @@ void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j,
 	unsigned calibrate_model = 0;
 	enum starpu_perf_archtype archtype STARPU_ATTRIBUTE_UNUSED = args->perf_arch;
 
-	_STARPU_TRACE_END_CODELET_BODY(j, archtype);
+	_STARPU_TRACE_END_CODELET_BODY(j, perf_arch, j->nimpl, archtype);
 
 	if (cl->model && cl->model->benchmarking)
 		calibrate_model = 1;

+ 1 - 1
src/drivers/driver_common/driver_common.h

@@ -25,7 +25,7 @@
 
 void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j,
 			      struct timespec *codelet_start, int rank);
-void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j,
+void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, enum starpu_perf_archtype perf_arch,
 			    struct timespec *codelet_end, int rank);
 void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_worker *worker_args,
 					enum starpu_perf_archtype perf_arch,

+ 1 - 1
src/drivers/opencl/driver_opencl.c

@@ -597,7 +597,7 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 	STARPU_ASSERT(func);
 	func(task->interfaces, task->cl_arg);
 
-	_starpu_driver_end_job(args, j, &codelet_end, 0);
+	_starpu_driver_end_job(args, j, args->perf_arch, &codelet_end, 0);
 
 	_starpu_driver_update_job_feedback(j, args, args->perf_arch,
 							&codelet_start, &codelet_end);

+ 2 - 2
src/profiling/bound.c

@@ -181,7 +181,7 @@ static void new_task(struct _starpu_job *j)
 	t->tag_id = j->task->tag_id;
 	t->use_tag = j->task->use_tag;
 	t->cl = j->task->cl;
-	t->footprint = _starpu_compute_buffers_footprint(j);
+	t->footprint = _starpu_compute_buffers_footprint(NULL, 0, 0, j);
 	t->priority = j->task->priority;
 	t->deps = NULL;
 	t->depsn = 0;
@@ -214,7 +214,7 @@ void _starpu_bound_record(struct _starpu_job *j)
 	{
 		struct bound_task_pool *tp;
 
-		_starpu_compute_buffers_footprint(j);
+		_starpu_compute_buffers_footprint(NULL, 0, 0, j);
 
 		if (last && last->cl == j->task->cl && last->footprint == j->footprint)
 			tp = last;