14 lat temu · bbe5e013e1
--- a/src/sched_policies/parallel_heft.c
+++ b/src/sched_policies/parallel_heft.c
@@ -202,15 +202,15 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio)
 
				 	   there is no performance prediction available yet */
			
 
				 	int forced_best = -1;
			
 
				 
			
 
				-	double local_task_length[nworkers+ncombinedworkers];
			
 
				-	double local_data_penalty[nworkers+ncombinedworkers];
			
 
				-	double local_power[nworkers+ncombinedworkers];
			
 
				-	double local_exp_end[nworkers+ncombinedworkers];
			
 
				-	double fitness[nworkers+ncombinedworkers];
			
 
				+	double local_task_length[nworkers+ncombinedworkers][STARPU_MAXIMPLEMENTATIONS];
			
 
				+	double local_data_penalty[nworkers+ncombinedworkers][STARPU_MAXIMPLEMENTATIONS];
			
 
				+	double local_power[nworkers+ncombinedworkers][STARPU_MAXIMPLEMENTATIONS];
			
 
				+	double local_exp_end[nworkers+ncombinedworkers][STARPU_MAXIMPLEMENTATIONS];
			
 
				+	double fitness[nworkers+ncombinedworkers][STARPU_MAXIMPLEMENTATIONS];
			
 
				 
			
 
				 	double max_exp_end = 0.0;
			
 
				 
			
 
				-	int skip_worker[nworkers+ncombinedworkers];
			
 
				+	int skip_worker[nworkers+ncombinedworkers][STARPU_MAXIMPLEMENTATIONS];
			
 
				 
			
 
				 	double best_exp_end = DBL_MAX;
			
 
				 	//double penality_best = 0.0;
			
@@ -240,38 +240,38 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio)
 
				 			if (!starpu_combined_worker_may_execute_task(worker, task, nimpl))
			
 
				 			{
			
 
				 				/* no one on that queue may execute this task */
			
 
				-				skip_worker[worker] = 1;
			
 
				+				skip_worker[worker][nimpl] = 1;
			
 
				 				continue;
			
 
				 			}
			
 
				 			else {
			
 
				-				skip_worker[worker] = 0;
			
 
				+				skip_worker[worker][nimpl] = 0;
			
 
				 			}
			
 
				 
			
 
				 			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
			
 
				 
			
 
				-			local_task_length[worker] = starpu_task_expected_length(task, perf_arch,nimpl);
			
 
				+			local_task_length[worker][nimpl] = starpu_task_expected_length(task, perf_arch,nimpl);
			
 
				 
			
 
				 			unsigned memory_node = starpu_worker_get_memory_node(worker);
			
 
				-			local_data_penalty[worker] = starpu_task_expected_data_transfer_time(memory_node, task);
			
 
				+			local_data_penalty[worker][nimpl] = starpu_task_expected_data_transfer_time(memory_node, task);
			
 
				 
			
 
				 			double ntasks_end = compute_ntasks_end(worker);
			
 
				 
			
 
				 			if (ntasks_best == -1
			
 
				 					|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
			
 
				-					|| (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
			
 
				-					|| (calibrating && local_task_length[worker] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
			
 
				+					|| (!calibrating && local_task_length[worker][nimpl] == -1.0) /* Not calibrating but this worker is being calibrated */
			
 
				+					|| (calibrating && local_task_length[worker][nimpl] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
			
 
				 					) {
			
 
				 				ntasks_best_end = ntasks_end;
			
 
				 				ntasks_best = worker;
			
 
				 			}
			
 
				 
			
 
				-			if (local_task_length[worker] == -1.0)
			
 
				+			if (local_task_length[worker][nimpl] == -1.0)
			
 
				 				/* we are calibrating, we want to speed-up calibration time
			
 
				 				 * so we privilege non-calibrated tasks (but still
			
 
				 				 * greedily distribute them to avoid dumb schedules) */
			
 
				 				calibrating = 1;
			
 
				 
			
 
				-			if (local_task_length[worker] <= 0.0)
			
 
				+			if (local_task_length[worker][nimpl] <= 0.0)
			
 
				 				/* there is no prediction available for that task
			
 
				 				 * with that arch yet, so switch to a greedy strategy */
			
 
				 				unknown = 1;
			
@@ -279,23 +279,23 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio)
 
				 			if (unknown)
			
 
				 				continue;
			
 
				 
			
 
				-			local_exp_end[worker] = compute_expected_end(worker, local_task_length[worker]);
			
 
				+			local_exp_end[worker][nimpl] = compute_expected_end(worker, local_task_length[worker][nimpl]);
			
 
				 
			
 
				-			//fprintf(stderr, "WORKER %d -> length %e end %e\n", worker, local_task_length[worker], local_exp_end[worker]);
			
 
				+			//fprintf(stderr, "WORKER %d -> length %e end %e\n", worker, local_task_length[worker][nimpl], local_exp_end[worker][nimpl]);
			
 
				 
			
 
				-			if (local_exp_end[worker] < best_exp_end)
			
 
				+			if (local_exp_end[worker][nimpl] < best_exp_end)
			
 
				 			{
			
 
				 				/* a better solution was found */
			
 
				-				best_exp_end = local_exp_end[worker];
			
 
				+				best_exp_end = local_exp_end[worker][nimpl];
			
 
				 				best_impl = nimpl;
			
 
				 			}
			
 
				 
			
 
				 
			
 
				-			local_power[worker] = starpu_task_expected_power(task, perf_arch,nimpl);
			
 
				-			//_STARPU_DEBUG("Scheduler parallel heft: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker],local_power[worker],worker,nimpl);
			
 
				+			local_power[worker][nimpl] = starpu_task_expected_power(task, perf_arch,nimpl);
			
 
				+			//_STARPU_DEBUG("Scheduler parallel heft: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker][nimpl],local_power[worker][nimpl],worker,nimpl);
			
 
				 
			
 
				-			if (local_power[worker] == -1.0)
			
 
				-				local_power[worker] = 0.;
			
 
				+			if (local_power[worker][nimpl] == -1.0)
			
 
				+				local_power[worker][nimpl] = 0.;
			
 
				 
			
 
				 		} //end for
			
 
				 	}
			
@@ -311,30 +311,30 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio)
 
				 		for (worker = 0; worker < nworkers+ncombinedworkers; worker++)
			
 
				 		{
			
 
				 
			
 
				-			if (skip_worker[worker])
			
 
				+			if (skip_worker[worker][nimpl])
			
 
				 			{
			
 
				 				/* no one on that queue may execute this task */
			
 
				 				continue;
			
 
				 			}
			
 
				 	
			
 
				-			fitness[worker] = alpha*(local_exp_end[worker] - best_exp_end) 
			
 
				-					+ beta*(local_data_penalty[worker])
			
 
				-					+ _gamma*(local_power[worker]);
			
 
				+			fitness[worker][nimpl] = alpha*(local_exp_end[worker][nimpl] - best_exp_end) 
			
 
				+					+ beta*(local_data_penalty[worker][nimpl])
			
 
				+					+ _gamma*(local_power[worker][nimpl]);
			
 
				 
			
 
				-			if (local_exp_end[worker] > max_exp_end)
			
 
				+			if (local_exp_end[worker][nimpl] > max_exp_end)
			
 
				 				/* This placement will make the computation
			
 
				 				 * longer, take into account the idle
			
 
				 				 * consumption of other cpus */
			
 
				-				fitness[worker] += _gamma * idle_power * (local_exp_end[worker] - max_exp_end) / 1000000.0;
			
 
				+				fitness[worker][nimpl] += _gamma * idle_power * (local_exp_end[worker][nimpl] - max_exp_end) / 1000000.0;
			
 
				 
			
 
				-			if (best == -1 || fitness[worker] < best_fitness)
			
 
				+			if (best == -1 || fitness[worker][nimpl] < best_fitness)
			
 
				 			{
			
 
				 				/* we found a better solution */
			
 
				-				best_fitness = fitness[worker];
			
 
				+				best_fitness = fitness[worker][nimpl];
			
 
				 				best = worker;
			
 
				 			}
			
 
				 
			
 
				-		//	fprintf(stderr, "FITNESS worker %d -> %e local_exp_end %e - local_data_penalty %e\n", worker, fitness[worker], local_exp_end[worker] - best_exp_end, local_data_penalty[worker]);
			
 
				+		//	fprintf(stderr, "FITNESS worker %d -> %e local_exp_end %e - local_data_penalty %e\n", worker, fitness[worker][nimpl], local_exp_end[worker][nimpl] - best_exp_end, local_data_penalty[worker][nimpl]);
			
 
				 		}
			
 
				 	}
			
 
				 
			
@@ -347,12 +347,12 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio)
 
				 		 * so we force this measurement */
			
 
				 		best = forced_best;
			
 
				 		//penality_best = 0.0;
			
 
				-		best_exp_end = local_exp_end[best];
			
 
				+		best_exp_end = local_exp_end[best][nimpl];
			
 
				 	}
			
 
				 	else 
			
 
				 	{
			
 
				-                //penality_best = local_data_penalty[best];
			
 
				-		best_exp_end = local_exp_end[best];
			
 
				+		//penality_best = local_data_penalty[best][nimpl];
			
 
				+		best_exp_end = local_exp_end[best][nimpl];
			
 
				 	}