il y a 14 ans · 37862069c7
--- a/doc/chapters/advanced-examples.texi
+++ b/doc/chapters/advanced-examples.texi
@@ -346,7 +346,7 @@ struct starpu_codelet cl = @{
 
																 @end cartouche
															
 
																 @item
															
 
																-Measured at runtime and refined by regression (@code{STARPU_REGRESSION_*_BASED}
															
 
																+Measured at runtime and refined by regression (@code{STARPU_*REGRESSION_BASED}
															
 
																 model type). This still assumes performance regularity, but can work
															
 
																 with various data input sizes, by applying regression over observed
															
 
																 execution times. STARPU_REGRESSION_BASED uses an a*n^b regression
															
@@ -374,6 +374,14 @@ per architecture.
 
																 @end itemize
															
 
																+For the @code{STARPU_HISTORY_BASED} and @code{STARPU_*REGRESSION_BASED} model types,
															
 
																+the total size of task data (both input and output) is used as an index by
															
 
																+default. The @code{size_base} field of @code{struct starpu_perfmodel} however
															
 
																+permits the application to override that, when for instance some of the data
															
 
																+do not matter for task cost (e.g. a mere reference table), or when using sparse
															
 
																+structures (in which case it is the number of non-zeros which matters), or when
															
 
																+there is some hidden parameter such as the number of iterations, etc.
															
 
																+
															
 
																 How to use schedulers which can benefit from such performance model is explained
															
 
																 in @ref{Task scheduling policy}.
															
--- a/doc/chapters/basic-api.texi
+++ b/doc/chapters/basic-api.texi
@@ -1460,6 +1460,9 @@ used as file name to store the model.
 
																 implementation number, and must return a task duration estimation in micro-seconds.
															
 
																 @item @code{per_arch}: Used by @code{STARPU_PER_ARCH}: array of @code{struct
															
 
																 starpu_per_arch_perfmodel} structures.
															
 
																+@item @code{size_base}: Used by @code{STARPU_HISTORY_BASED} and
															
 
																+@code{STARPU_*REGRESSION_BASED}. If not NULL, takes a task and implementation
															
 
																+number, and returns the size to be used as index for history and regression.
															
 
																 @end table
															
 
																 @end deftp
															
@@ -1479,6 +1482,8 @@ records all execution history measures.
 
																 Used by @code{STARPU_HISTORY_REGRESION_BASED} and
															
 
																 @code{STARPU_NL_REGRESSION_BASED}, contains the estimated factors of the
															
 
																 regression.
															
 
																+@item @code{size_base}: Same as in @code{struct starpu_perfmodel}, but per-arch, in
															
 
																+case it depends on the architecture-specific implementation.
															
 
																 @end table
															
 
																 @end deftp
															
--- a/examples/opt/pi/pi.c
+++ b/examples/opt/pi/pi.c
@@ -61,6 +61,12 @@ static void cpu_kernel(void *descr[], void *cl_arg)
 
																 	free(random_numbers);
															
 
																 }
															
 
																+/* The amount of work does not depend on the data size at all :) */
															
 
																+static size_t size_base(struct starpu_task *task, unsigned nimpl)
															
 
																+{
															
 
																+	return NSHOT_PER_TASK;
															
 
																+}
															
 
																+
															
 
																 static void parse_args(int argc, char **argv)
															
 
																 {
															
 
																 	int i;
															
@@ -114,6 +120,7 @@ int main(int argc, char **argv)
 
																 	static struct starpu_perfmodel model =
															
 
																 	{
															
 
																 		.type = STARPU_HISTORY_BASED,
															
 
																+		.size_base = size_base,
															
 
																 		.symbol = "monte_carlo_pi"
															
 
																 	};
															
--- a/examples/opt/pi/pi_redux.c
+++ b/examples/opt/pi/pi_redux.c
@@ -189,6 +189,19 @@ static void pi_func_cuda(void *descr[], void *cl_arg __attribute__ ((unused)))
 
																 }
															
 
																 #endif
															
 
																+/* The amount of work does not depend on the data size at all :) */
															
 
																+static size_t size_base(struct starpu_task *task, unsigned nimpl)
															
 
																+{
															
 
																+	return NSHOT_PER_TASK;
															
 
																+}
															
 
																+
															
 
																+static struct starpu_perfmodel model =
															
 
																+{
															
 
																+	.type = STARPU_HISTORY_BASED,
															
 
																+	.size_base = size_base,
															
 
																+	.symbol = "monte_carlo_pi_redux"
															
 
																+};
															
 
																+
															
 
																 static struct starpu_codelet pi_cl =
															
 
																 {
															
 
																 	.where =
															
--- a/include/starpu_perfmodel.h
+++ b/include/starpu_perfmodel.h
@@ -135,6 +135,7 @@ struct starpu_per_arch_perfmodel
 
																 {
															
 
																 	double (*cost_model)(struct starpu_buffer_descr *t) STARPU_DEPRECATED; /* returns expected duration in µs */
															
 
																 	double (*cost_function)(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl); /* returns expected duration in µs */
															
 
																+	size_t (*size_base)(struct starpu_task *, enum starpu_perf_archtype arch, unsigned nimpl);
															
 
																 	/* internal variables */
															
 
																 	struct starpu_htbl32_node *history;
															
@@ -163,6 +164,8 @@ struct starpu_perfmodel
 
																 	double (*cost_model)(struct starpu_buffer_descr *) STARPU_DEPRECATED;
															
 
																 	double (*cost_function)(struct starpu_task *, unsigned nimpl);
															
 
																+	size_t (*size_base)(struct starpu_task *, unsigned nimpl);
															
 
																+
															
 
																 	/* per-architecture model */
															
 
																 	struct starpu_per_arch_perfmodel per_arch[STARPU_NARCH_VARIATIONS][STARPU_MAXIMPLEMENTATIONS];
															
--- a/src/common/fxt.h
+++ b/src/common/fxt.h
@@ -199,11 +199,11 @@ do {									\
 
																 	}								\
															
 
																 } while(0);
															
 
																-#define _STARPU_TRACE_END_CODELET_BODY(job, archtype)			\
															
 
																+#define _STARPU_TRACE_END_CODELET_BODY(job, perf_arch, nimpl, archtype)			\
															
 
																 do {									\
															
 
																-	const size_t job_size = _starpu_job_get_data_size((job));	\
															
 
																-	const uint32_t job_hash = _starpu_compute_buffers_footprint(job);\
															
 
																-	FUT_DO_PROBE5(_STARPU_FUT_END_CODELET_BODY, job, (job_size), (job_hash), (archtype), syscall(SYS_gettid));	\
															
 
																+	const size_t job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));	\
															
 
																+	const uint32_t job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));\
															
 
																+	FUT_DO_PROBE5(_STARPU_FUT_END_CODELET_BODY, (job), (job_size), (job_hash), (archtype), syscall(SYS_gettid));	\
															
 
																 } while(0);
															
 
																 #define _STARPU_TRACE_START_CALLBACK(job)	\
															
--- a/src/core/jobs.c
+++ b/src/core/jobs.c
@@ -27,22 +27,26 @@
 
																 #include <profiling/bound.h>
															
 
																 #include <starpu_top.h>
															
 
																-size_t _starpu_job_get_data_size(struct _starpu_job *j)
															
 
																+size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned nimpl, struct _starpu_job *j)
															
 
																 {
															
 
																-	size_t size = 0;
															
 
																-
															
 
																 	struct starpu_task *task = j->task;
															
 
																-	unsigned nbuffers = task->cl->nbuffers;
															
 
																-
															
 
																-	unsigned buffer;
															
 
																-	for (buffer = 0; buffer < nbuffers; buffer++)
															
 
																-	{
															
 
																-		starpu_data_handle_t handle = task->handles[buffer];
															
 
																-		size += _starpu_data_get_size(handle);
															
 
																+	if (model && model->per_arch[arch][nimpl].size_base) {
															
 
																+		return model->per_arch[arch][nimpl].size_base(task, arch, nimpl);
															
 
																+	} else if (model && model->size_base) {
															
 
																+		return model->size_base(task, nimpl);
															
 
																+	} else {
															
 
																+		unsigned nbuffers = task->cl->nbuffers;
															
 
																+		size_t size = 0;
															
 
																+
															
 
																+		unsigned buffer;
															
 
																+		for (buffer = 0; buffer < nbuffers; buffer++)
															
 
																+		{
															
 
																+			starpu_data_handle_t handle = task->handles[buffer];
															
 
																+			size += _starpu_data_get_size(handle);
															
 
																+		}
															
 
																+		return size;
															
 
																 	}
															
 
																-
															
 
																-	return size;
															
 
																 }
															
 
																 /* we need to identify each task to generate the DAG. */
															
--- a/src/core/jobs.h
+++ b/src/core/jobs.h
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2009-2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  * Copyright (C) 2011  Télécom-SudParis
															
 
																  *
															
@@ -151,7 +151,7 @@ unsigned _starpu_enforce_deps_starting_from_task(struct _starpu_job *j, unsigned
 
																 void _starpu_handle_job_termination(struct _starpu_job *j, unsigned job_is_already_locked);
															
 
																 /* Get the sum of the size of the data accessed by the job. */
															
 
																-size_t _starpu_job_get_data_size(struct _starpu_job *j);
															
 
																+size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned nimpl, struct _starpu_job *j);
															
 
																 /* Get a task from the local pool of tasks that were explicitly attributed to
															
 
																  * that worker. */
															
--- a/src/core/perfmodel/perfmodel_history.c
+++ b/src/core/perfmodel/perfmodel_history.c
@@ -893,7 +893,7 @@ void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model,
 
																 double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, struct _starpu_job *j, unsigned nimpl)
															
 
																 {
															
 
																 	double exp = -1.0;
															
 
																-	size_t size = _starpu_job_get_data_size(j);
															
 
																+	size_t size = _starpu_job_get_data_size(model, arch, nimpl, j);
															
 
																 	struct starpu_regression_model *regmodel;
															
 
																 	regmodel = &model->per_arch[arch][nimpl].regression;
															
@@ -907,7 +907,7 @@ double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model
 
																 double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, struct _starpu_job *j,unsigned nimpl)
															
 
																 {
															
 
																 	double exp = -1.0;
															
 
																-	size_t size = _starpu_job_get_data_size(j);
															
 
																+	size_t size = _starpu_job_get_data_size(model, arch, nimpl, j);
															
 
																 	struct starpu_regression_model *regmodel;
															
 
																 	regmodel = &model->per_arch[arch][nimpl].regression;
															
@@ -916,7 +916,7 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 
																 		exp = regmodel->a*pow((double)size, regmodel->b) + regmodel->c;
															
 
																 	else
															
 
																 	{
															
 
																-		uint32_t key = _starpu_compute_buffers_footprint(j);
															
 
																+		uint32_t key = _starpu_compute_buffers_footprint(model, arch, nimpl, j);
															
 
																 		struct starpu_per_arch_perfmodel *per_arch_model = &model->per_arch[arch][nimpl];
															
 
																 		struct starpu_htbl32_node *history = per_arch_model->history;
															
 
																 		struct starpu_history_entry *entry;
															
@@ -945,7 +945,7 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, e
 
																 	struct starpu_history_entry *entry;
															
 
																 	struct starpu_htbl32_node *history;
															
 
																-	uint32_t key = _starpu_compute_buffers_footprint(j);
															
 
																+	uint32_t key = _starpu_compute_buffers_footprint(model, arch, nimpl, j);
															
 
																 	per_arch_model = &model->per_arch[arch][nimpl];
															
@@ -989,7 +989,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
																 			struct starpu_htbl32_node *history;
															
 
																 			struct starpu_htbl32_node **history_ptr;
															
 
																 			struct starpu_history_list **list;
															
 
																-			uint32_t key = _starpu_compute_buffers_footprint(j);
															
 
																+			uint32_t key = _starpu_compute_buffers_footprint(model, arch, nimpl, j);
															
 
																 			history = per_arch_model->history;
															
 
																 			history_ptr = &per_arch_model->history;
															
@@ -1008,7 +1008,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
																 					entry->deviation = 0.0;
															
 
																 					entry->sum2 = measured*measured;
															
 
																-					entry->size = _starpu_job_get_data_size(j);
															
 
																+					entry->size = _starpu_job_get_data_size(model, arch, nimpl, j);
															
 
																 					entry->footprint = key;
															
 
																 					entry->nsample = 1;
															
@@ -1037,7 +1037,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
																 			reg_model = &per_arch_model->regression;
															
 
																 			/* update the regression model */
															
 
																-			size_t job_size = _starpu_job_get_data_size(j);
															
 
																+			size_t job_size = _starpu_job_get_data_size(model, arch, nimpl, j);
															
 
																 			double logy, logx;
															
 
																 			logx = log((double)job_size);
															
 
																 			logy = log(measured);
															
@@ -1069,11 +1069,11 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
																 		FILE * debug_file = per_arch_model->debug_file;
															
 
																 		if (!j->footprint_is_computed)
															
 
																-			(void) _starpu_compute_buffers_footprint(j);
															
 
																+			(void) _starpu_compute_buffers_footprint(model, arch, nimpl, j);
															
 
																 		STARPU_ASSERT(j->footprint_is_computed);
															
 
																-		fprintf(debug_file, "0x%x\t%lu\t%f\t%f\t%f\t%d\t\t", j->footprint, (unsigned long) _starpu_job_get_data_size(j), measured, task->predicted, task->predicted_transfer, cpuid);
															
 
																+		fprintf(debug_file, "0x%x\t%lu\t%f\t%f\t%f\t%d\t\t", j->footprint, (unsigned long) _starpu_job_get_data_size(model, arch, nimpl, j), measured, task->predicted, task->predicted_transfer, cpuid);
															
 
																 		unsigned i;
															
 
																 		for (i = 0; i < task->cl->nbuffers; i++)
															
--- a/src/datawizard/footprint.c
+++ b/src/datawizard/footprint.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2009-2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -18,7 +18,7 @@
 
																 #include <datawizard/footprint.h>
															
 
																 #include <common/hash.h>
															
 
																-uint32_t _starpu_compute_buffers_footprint(struct _starpu_job *j)
															
 
																+uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned nimpl, struct _starpu_job *j)
															
 
																 {
															
 
																 	if (j->footprint_is_computed)
															
 
																 		return j->footprint;
															
@@ -28,13 +28,21 @@ uint32_t _starpu_compute_buffers_footprint(struct _starpu_job *j)
 
																 	struct starpu_task *task = j->task;
															
 
																-	for (buffer = 0; buffer < task->cl->nbuffers; buffer++)
															
 
																-	{
															
 
																-		starpu_data_handle_t handle = task->handles[buffer];
															
 
																-
															
 
																-		uint32_t handle_footprint = _starpu_data_get_footprint(handle);
															
 
																-
															
 
																-		footprint = _starpu_crc32_be(handle_footprint, footprint);
															
 
																+	if (model && model->per_arch[arch][nimpl].size_base) {
															
 
																+		size_t size = model->per_arch[arch][nimpl].size_base(task, arch, nimpl);
															
 
																+		footprint = _starpu_crc32_be_n(&size, sizeof(size), footprint);
															
 
																+	} else if (model && model->size_base) {
															
 
																+		size_t size = model->size_base(task, nimpl);
															
 
																+		footprint = _starpu_crc32_be_n(&size, sizeof(size), footprint);
															
 
																+	} else {
															
 
																+		for (buffer = 0; buffer < task->cl->nbuffers; buffer++)
															
 
																+		{
															
 
																+			starpu_data_handle_t handle = task->handles[buffer];
															
 
																+
															
 
																+			uint32_t handle_footprint = _starpu_data_get_footprint(handle);
															
 
																+
															
 
																+			footprint = _starpu_crc32_be(handle_footprint, footprint);
															
 
																+		}
															
 
																 	}
															
 
																 	j->footprint = footprint;
															
--- a/src/datawizard/footprint.h
+++ b/src/datawizard/footprint.h
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2009-2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -24,7 +24,7 @@
 
																 /* Compute the footprint that characterizes the job and cache it into the job
															
 
																  * structure. */
															
 
																-uint32_t _starpu_compute_buffers_footprint(struct _starpu_job *j);
															
 
																+uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned nimpl, struct _starpu_job *j);
															
 
																 /* Compute the footprint that characterizes the layout of the data handle. */
															
 
																 uint32_t _starpu_compute_data_footprint(starpu_data_handle_t handle);
															
--- a/src/drivers/cpu/driver_cpu.c
+++ b/src/drivers/cpu/driver_cpu.c
@@ -60,7 +60,7 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct _starpu_worker *cpu_
 
																 		func(task->interfaces, task->cl_arg);
															
 
																 	}
															
 
																-	_starpu_driver_end_job(cpu_args, j, &codelet_end, rank);
															
 
																+	_starpu_driver_end_job(cpu_args, j, perf_arch, &codelet_end, rank);
															
 
																 	if (is_parallel_task)
															
 
																 		_STARPU_PTHREAD_BARRIER_WAIT(&j->after_work_barrier);
															
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -221,7 +221,7 @@ static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 
																 	STARPU_ASSERT(func);
															
 
																 	func(task->interfaces, task->cl_arg);
															
 
																-	_starpu_driver_end_job(args, j, &codelet_end, 0);
															
 
																+	_starpu_driver_end_job(args, j, args->perf_arch, &codelet_end, 0);
															
 
																 	_starpu_driver_update_job_feedback(j, args, args->perf_arch, &codelet_start, &codelet_end);
															
--- a/src/drivers/driver_common/driver_common.c
+++ b/src/drivers/driver_common/driver_common.c
@@ -60,7 +60,7 @@ void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j
 
																 	_STARPU_TRACE_START_CODELET_BODY(j);
															
 
																 }
															
 
																-void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, struct timespec *codelet_end, int rank)
															
 
																+void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, enum starpu_perf_archtype perf_arch, struct timespec *codelet_end, int rank)
															
 
																 {
															
 
																 	struct starpu_task *task = j->task;
															
 
																 	struct starpu_codelet *cl = task->cl;
															
@@ -71,7 +71,7 @@ void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j,
 
																 	unsigned calibrate_model = 0;
															
 
																 	enum starpu_perf_archtype archtype STARPU_ATTRIBUTE_UNUSED = args->perf_arch;
															
 
																-	_STARPU_TRACE_END_CODELET_BODY(j, archtype);
															
 
																+	_STARPU_TRACE_END_CODELET_BODY(j, perf_arch, j->nimpl, archtype);
															
 
																 	if (cl->model && cl->model->benchmarking)
															
 
																 		calibrate_model = 1;
															
--- a/src/drivers/driver_common/driver_common.h
+++ b/src/drivers/driver_common/driver_common.h
@@ -25,7 +25,7 @@
 
																 void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j,
															
 
																 			      struct timespec *codelet_start, int rank);
															
 
																-void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j,
															
 
																+void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, enum starpu_perf_archtype perf_arch,
															
 
																 			    struct timespec *codelet_end, int rank);
															
 
																 void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_worker *worker_args,
															
 
																 					enum starpu_perf_archtype perf_arch,
															
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -597,7 +597,7 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 
																 	STARPU_ASSERT(func);
															
 
																 	func(task->interfaces, task->cl_arg);
															
 
																-	_starpu_driver_end_job(args, j, &codelet_end, 0);
															
 
																+	_starpu_driver_end_job(args, j, args->perf_arch, &codelet_end, 0);
															
 
																 	_starpu_driver_update_job_feedback(j, args, args->perf_arch,
															
 
																 							&codelet_start, &codelet_end);
															
--- a/src/profiling/bound.c
+++ b/src/profiling/bound.c
@@ -181,7 +181,7 @@ static void new_task(struct _starpu_job *j)
 
																 	t->tag_id = j->task->tag_id;
															
 
																 	t->use_tag = j->task->use_tag;
															
 
																 	t->cl = j->task->cl;
															
 
																-	t->footprint = _starpu_compute_buffers_footprint(j);
															
 
																+	t->footprint = _starpu_compute_buffers_footprint(NULL, 0, 0, j);
															
 
																 	t->priority = j->task->priority;
															
 
																 	t->deps = NULL;
															
 
																 	t->depsn = 0;
															
@@ -214,7 +214,7 @@ void _starpu_bound_record(struct _starpu_job *j)
 
																 	{
															
 
																 		struct bound_task_pool *tp;
															
 
																-		_starpu_compute_buffers_footprint(j);
															
 
																+		_starpu_compute_buffers_footprint(NULL, 0, 0, j);
															
 
																 		if (last && last->cl == j->task->cl && last->footprint == j->footprint)
															
 
																 			tp = last;