Pārlūkot izejas kodu

properly initialize codelet_start/codelet_end timestamps
add check sequence for perfmodels to validate model parameters on model load and before store

Olivier Aumage 8 gadi atpakaļ
vecāks
revīzija
58d4d0dcd0

+ 20 - 7
src/core/perfmodel/perfmodel.c

@@ -181,6 +181,7 @@ void _starpu_init_and_load_perfmodel(struct starpu_perfmodel *model)
 
 static double starpu_model_expected_perf(struct starpu_task *task, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch,  unsigned nimpl)
 {
+	double exp_perf = 0.0;
 	if (model)
 	{
 		_starpu_init_and_load_perfmodel(model);
@@ -190,24 +191,36 @@ static double starpu_model_expected_perf(struct starpu_task *task, struct starpu
 		switch (model->type)
 		{
 			case STARPU_PER_ARCH:
-				return per_arch_task_expected_perf(model, arch, task, nimpl);
+				exp_perf = per_arch_task_expected_perf(model, arch, task, nimpl);
+				STARPU_ASSERT_MSG(isnan(exp_perf)||exp_perf>=0,"exp_perf=%lf\n",exp_perf);
+				break;
 			case STARPU_COMMON:
-				return common_task_expected_perf(model, arch, task, nimpl);
+				exp_perf = common_task_expected_perf(model, arch, task, nimpl);
+				STARPU_ASSERT_MSG(isnan(exp_perf)||exp_perf>=0,"exp_perf=%lf\n",exp_perf);
+				break;
 			case STARPU_HISTORY_BASED:
-				return _starpu_history_based_job_expected_perf(model, arch, j, nimpl);
+				exp_perf = _starpu_history_based_job_expected_perf(model, arch, j, nimpl);
+				STARPU_ASSERT_MSG(isnan(exp_perf)||exp_perf>=0,"exp_perf=%lf\n",exp_perf);
+				break;
 			case STARPU_REGRESSION_BASED:
-				return _starpu_regression_based_job_expected_perf(model, arch, j, nimpl);
+				exp_perf = _starpu_regression_based_job_expected_perf(model, arch, j, nimpl);
+				STARPU_ASSERT_MSG(isnan(exp_perf)||exp_perf>=0,"exp_perf=%lf\n",exp_perf);
+				break;
 			case STARPU_NL_REGRESSION_BASED:
-				return _starpu_non_linear_regression_based_job_expected_perf(model, arch, j,nimpl);
+				exp_perf = _starpu_non_linear_regression_based_job_expected_perf(model, arch, j,nimpl);
+				STARPU_ASSERT_MSG(isnan(exp_perf)||exp_perf>=0,"exp_perf=%lf\n",exp_perf);
+				break;
 			case STARPU_MULTIPLE_REGRESSION_BASED:
-				return _starpu_multiple_regression_based_job_expected_perf(model, arch, j, nimpl);
+				exp_perf = _starpu_multiple_regression_based_job_expected_perf(model, arch, j, nimpl);
+				STARPU_ASSERT_MSG(isnan(exp_perf)||exp_perf>=0,"exp_perf=%lf\n",exp_perf);
+				break;
 			default:
 				STARPU_ABORT();
 		}
 	}
 
 	/* no model was found */
-	return 0.0;
+	return exp_perf;
 }
 
 double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)

+ 156 - 2
src/core/perfmodel/perfmodel_history.c

@@ -242,6 +242,68 @@ static void insert_history_entry(struct starpu_perfmodel_history_entry *entry, s
 }
 
 #ifndef STARPU_SIMGRID
+static void check_reg_model(struct starpu_perfmodel *model, int comb, int impl)
+{ /* Sanity-check the regression data stored in model->state->per_arch[comb][impl]; most checks are still TODO. */
+	struct starpu_perfmodel_per_arch *per_arch_model;
+
+	per_arch_model = &model->state->per_arch[comb][impl];
+	struct starpu_perfmodel_regression_model *reg_model;
+	reg_model = &per_arch_model->regression;
+
+	/*
+	 * Linear Regression model
+	 */
+
+	/* alpha and beta stay NaN unless the model is (non-)linear regression based and has more than one sample */
+	double alpha = nan(""), beta = nan("");
+	if (model->type == STARPU_REGRESSION_BASED || model->type == STARPU_NL_REGRESSION_BASED)
+	{
+		if (reg_model->nsample > 1)
+		{
+			alpha = reg_model->alpha;
+			beta = reg_model->beta;
+		}
+	}
+
+	/* TODO: check:
+	 * reg_model->sumlnx
+	 * reg_model->sumlnx2
+	 * reg_model->sumlny
+	 * reg_model->sumlnxlny
+	 * alpha
+	 * beta
+	 * reg_model->minx
+	 * reg_model->maxx
+	 */
+	STARPU_ASSERT(reg_model->nsample >= 0); /* NOTE(review): always true if nsample is an unsigned field -- confirm its type */
+	(void)alpha; /* alpha and beta are not checked yet; the casts only mark them as intentionally unused */
+	(void)beta;
+
+	/*
+	 * Non-Linear Regression model
+	 */
+
+	double a = nan(""), b = nan(""), c = nan("");
+
+	if (model->type == STARPU_NL_REGRESSION_BASED)
+		_starpu_regression_non_linear_power(per_arch_model->list, &a, &b, &c); /* NOTE(review): a, b, c are not inspected after this call */
+
+	/* TODO: check:
+	 * a
+	 * b
+	 * c
+	 */
+
+	/*
+	 * Multiple Regression Model
+	 */
+
+	if (model->type == STARPU_MULTIPLE_REGRESSION_BASED)
+	{
+		/* TODO: check: */
+	}
+}
+
 static void dump_reg_model(FILE *f, struct starpu_perfmodel *model, int comb, int impl)
 {
 	struct starpu_perfmodel_per_arch *per_arch_model;
@@ -416,6 +478,15 @@ static void scan_reg_model(FILE *f, const char *path, struct starpu_perfmodel_re
 
 
 #ifndef STARPU_SIMGRID
+static void check_history_entry(struct starpu_perfmodel_history_entry *entry)
+{ /* Assert that every numeric statistic of one history entry is non-negative. */
+	STARPU_ASSERT(entry->deviation >= 0); /* NOTE(review): a NaN field also fails these `>= 0` comparisons */
+	STARPU_ASSERT(entry->sum >= 0);
+	STARPU_ASSERT(entry->sum2 >= 0);
+	STARPU_ASSERT(entry->mean >= 0);
+	STARPU_ASSERT(entry->flops >= 0);
+	STARPU_ASSERT(entry->duration >= 0);
+}
 static void dump_history_entry(FILE *f, struct starpu_perfmodel_history_entry *entry)
 {
 	fprintf(f, "%08x\t%-15lu\t%-15e\t%-15e\t%-15e\t%-15e\t%-15e\t%u\n", entry->footprint, (unsigned long) entry->size, entry->flops, entry->mean, entry->deviation, entry->sum, entry->sum2, entry->nsample);
@@ -458,6 +529,11 @@ static void scan_history_entry(FILE *f, const char *path, struct starpu_perfmode
 
 	if (entry)
 	{
+		STARPU_ASSERT_MSG(flops >=0, "Negative flops %lf in performance model file %s", flops, path);
+		STARPU_ASSERT_MSG(mean >=0, "Negative mean %lf in performance model file %s", mean, path);
+		STARPU_ASSERT_MSG(deviation >=0, "Negative deviation %lf in performance model file %s", deviation, path);
+		STARPU_ASSERT_MSG(sum >=0, "Negative sum %lf in performance model file %s", sum, path);
+		STARPU_ASSERT_MSG(sum2 >=0, "Negative sum2 %lf in performance model file %s", sum2, path);
 		entry->footprint = footprint;
 		entry->size = size;
 		entry->flops = flops;
@@ -487,7 +563,7 @@ static void parse_per_arch_model_file(FILE *f, const char *path, struct starpu_p
 		struct starpu_perfmodel_history_entry *entry = NULL;
 		if (scan_history)
 		{
-			_STARPU_MALLOC(entry, sizeof(struct starpu_perfmodel_history_entry));
+			_STARPU_CALLOC(entry, 1, sizeof(struct starpu_perfmodel_history_entry));
 
 			/* Tell  helgrind that we do not care about
 			 * racing access to the sampling, we only want a
@@ -660,6 +736,43 @@ static int parse_model_file(FILE *f, const char *path, struct starpu_perfmodel *
 }
 
 #ifndef STARPU_SIMGRID
+static void check_per_arch_model(struct starpu_perfmodel *model, int comb, unsigned impl)
+{ /* Check the per-arch data of (comb, impl): arch name, regression model, then every history entry. */
+	struct starpu_perfmodel_per_arch *per_arch_model;
+
+	per_arch_model = &model->state->per_arch[comb][impl];
+	/* count the number of elements in the list */
+	struct starpu_perfmodel_history_list *ptr = NULL;
+	unsigned nentries = 0; /* NOTE(review): incremented below but never read afterwards */
+
+	if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
+	{
+		/* Walk the history list once to count its entries */
+		ptr = per_arch_model->list;
+		while(ptr)
+		{
+			nentries++;
+			ptr = ptr->next;
+		}
+	}
+
+	/* the architecture name built for this combination must not be empty */
+	char archname[32];
+	starpu_perfmodel_get_arch_name(arch_combs[comb], archname,  32, impl);
+	STARPU_ASSERT(strlen(archname)>0);
+	check_reg_model(model, comb, impl);
+
+	/* Check every history entry; only history-based and non-linear regression models are walked */
+	if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
+	{
+		ptr = per_arch_model->list;
+		while (ptr)
+		{
+			check_history_entry(ptr->entry);
+			ptr = ptr->next;
+		}
+	}
+}
 static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, int comb, unsigned impl)
 {
 	struct starpu_perfmodel_per_arch *per_arch_model;
@@ -704,6 +817,39 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, in
 	fprintf(f, "\n");
 }
 
+static void check_model(struct starpu_perfmodel *model)
+{ /* Assert the consistency of every arch combination in model->state, then of each implementation's per-arch data. */
+	int ncombs = model->state->ncombs;
+	STARPU_ASSERT(ncombs >= 0);
+
+	int i, impl, dev;
+	for(i = 0; i < ncombs; i++)
+	{
+		int comb = model->state->combs[i]; /* combs[i] is used as an index into arch_combs and nimpls below */
+		STARPU_ASSERT(comb >= 0);
+
+		int ndevices = arch_combs[comb]->ndevices;
+		STARPU_ASSERT(ndevices >= 1);
+
+		for(dev = 0; dev < ndevices; dev++)
+		{
+			STARPU_ASSERT(arch_combs[comb]->devices[dev].type >= 0);
+			STARPU_ASSERT(arch_combs[comb]->devices[dev].type <= 5); /* NOTE(review): 5 is presumably the highest device-type value -- confirm, a named constant would be clearer */
+
+			STARPU_ASSERT(arch_combs[comb]->devices[dev].devid >= 0);
+
+			STARPU_ASSERT(arch_combs[comb]->devices[dev].ncores >= 1);
+		}
+
+		int nimpls = model->state->nimpls[comb];
+		STARPU_ASSERT(nimpls >= 1);
+		for (impl = 0; impl < nimpls; impl++)
+		{
+			check_per_arch_model(model, comb, impl);
+		}
+	}
+}
+
 static void dump_model_file(FILE *f, struct starpu_perfmodel *model)
 {
 	fprintf(f, "##################\n");
@@ -873,6 +1019,7 @@ static void save_history_based_model(struct starpu_perfmodel *model)
 	STARPU_ASSERT_MSG(f, "Could not save performance model %s\n", path);
 
 	locked = _starpu_fwrlock(f) == 0;
+	check_model(model);
 	_starpu_fftruncate(f, 0);
 	dump_model_file(f, model);
 	if (locked)
@@ -1423,6 +1570,7 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, s
 	history = per_arch_model->history;
 	HASH_FIND_UINT32_T(history, &key, elt);
 	entry = (elt == NULL) ? NULL : elt->history_entry;
+	STARPU_ASSERT_MSG(!entry || entry->mean >= 0, "entry=%p, entry->mean=%lf\n", entry, entry?entry->mean:NAN);
 	STARPU_PTHREAD_RWLOCK_UNLOCK(&model->state->model_rwlock);
 
 	/* Here helgrind would shout that this is unprotected access.
@@ -1430,10 +1578,13 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, s
 	 * a good-enough estimation */
 
 	if (entry && entry->nsample >= _starpu_calibration_minimum)
+	{
+		STARPU_ASSERT_MSG(entry->mean >= 0, "entry->mean=%lf\n", entry->mean);
 		/* TODO: report differently if we've scheduled really enough
 		 * of that task and the scheduler should perhaps put it aside */
 		/* Calibrated enough */
 		exp = entry->mean;
+	}
 
 docal:
 	STARPU_HG_DISABLE_CHECKING(model->benchmarking);
@@ -1447,6 +1598,7 @@ docal:
 		model->benchmarking = 1;
 	}
 
+	STARPU_ASSERT_MSG(isnan(exp)||exp >= 0, "exp=%lf\n", exp);
 	return exp;
 }
 
@@ -1470,6 +1622,7 @@ int _starpu_perfmodel_create_comb_if_needed(struct starpu_perfmodel_arch* arch)
 
 void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned cpuid STARPU_ATTRIBUTE_UNUSED, double measured, unsigned impl)
 {
+	STARPU_ASSERT_MSG(measured >= 0, "measured=%lf\n", measured);
 	if (model)
 	{
 		int c;
@@ -1526,7 +1679,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 			if (!entry)
 			{
 				/* this is the first entry with such a footprint */
-				_STARPU_MALLOC(entry, sizeof(struct starpu_perfmodel_history_entry));
+				_STARPU_CALLOC(entry, 1, sizeof(struct starpu_perfmodel_history_entry));
 
 				/* Tell  helgrind that we do not care about
 				 * racing access to the sampling, we only want a
@@ -1645,6 +1798,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 			_STARPU_MALLOC(entry->parameters, model->nparameters*sizeof(double));
 			model->parameters(j->task, entry->parameters);
 			entry->tag = j->task->tag_id;
+			STARPU_ASSERT(measured >= 0);
 			entry->duration = measured;
 
 			struct starpu_perfmodel_history_list *link;

+ 5 - 1
src/drivers/cpu/driver_cpu.c

@@ -55,7 +55,11 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 {
 	int is_parallel_task = (j->task_size > 1);
 	int profiling = starpu_profiling_status_get();
-	struct timespec codelet_start, codelet_end;
+	/* start/end timestamps are only conditionally measured in
+	 * _starpu_driver_start_job/_end_job, thus make sure that they are
+	 * always initialized */
+	struct timespec codelet_start = {0,0};
+	struct timespec codelet_end = {0,0};
 
 	struct starpu_task *task = j->task;
 	struct starpu_codelet *cl = task->cl;

+ 1 - 0
src/drivers/driver_common/driver_common.c

@@ -195,6 +195,7 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 
 		starpu_timespec_sub(codelet_end, codelet_start, &measured_ts);
 		measured = starpu_timing_timespec_to_us(&measured_ts);
+		STARPU_ASSERT_MSG(measured >= 0, "measured=%lf\n", measured);
 
 		if (profiling && profiling_info)
 		{

+ 1 - 0
src/sched_policies/component_sched.c

@@ -73,6 +73,7 @@ int starpu_sched_component_execute_preds(struct starpu_sched_component * compone
 				{
 					continue;
 				}
+				STARPU_ASSERT_MSG(d >= 0, "workerid=%d, nimpl=%d, bundle=%p, d=%lf\n", workerid, nimpl, bundle, d);
 				if(d < len)
 				{
 					len = d;

+ 1 - 0
src/sched_policies/helper_mct.c

@@ -133,6 +133,7 @@ int starpu_mct_compute_expected_times(struct starpu_sched_component *component,
 				/* The perfmodel had been purged since the task was pushed
 				 * onto the mct component. */
 				continue;
+			STARPU_ASSERT_MSG(estimated_lengths[i]>=0, "component=%p, child[%d]=%p, estimated_lengths[%d]=%lf\n", component, i, c, i, estimated_lengths[i]);
 
 			/* Estimated availability of worker */
 			double estimated_end = c->estimated_end(c);