ispeed_lp_policy.c

/* StarPU --- Runtime system for heterogeneous multicore architectures.
 *
 * Copyright (C) 2011-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
 *
 * StarPU is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * StarPU is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * See the GNU Lesser General Public License in COPYING.LGPL for more details.
 */

#include <starpu_config.h>
#include "sc_hypervisor_lp.h"
#include "sc_hypervisor_policy.h"
#include <math.h>
#include <sys/time.h>
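
/* Data handed to the LP callback through the dichotomy driver:
 * per-(ctx, worker) speeds, per-ctx flops to schedule, the resulting
 * flops distribution and the (optional) subset of workers considered. */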
struct ispeed_lp_data
{
	double **speed;
	double *flops;
	double **flops_on_w;
	int *workers;
};

#ifdef STARPU_HAVE_GLPK_H
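/* Callback used by the dichotomy search: simulate one LP flops distribution
 * for a candidate tmax and report whether a feasible solution exists. */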
static double _compute_workers_distrib(int ns, int nw, double final_w_in_s[ns][nw],
				       unsigned is_integer, double tmax, void *specific_data)
{
	struct ispeed_lp_data *sd = (struct ispeed_lp_data *)specific_data;

	double **speed = sd->speed;
	double *flops = sd->flops;
	double **final_flops_on_w = sd->flops_on_w;

	return sc_hypervisor_lp_simulate_distrib_flops_on_sample(ns, nw, final_w_in_s, is_integer, tmax, speed, flops, final_flops_on_w);
}
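
/* Build the per-(ctx, worker) speed matrix and the per-ctx flops targets,
 * then run a dichotomy on tmax to find a distribution of workers (w_in_s)
 * and flops (flops_on_w) over the contexts. */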
static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_in_s[ns][nw], double **flops_on_w, unsigned *sched_ctxs, int *workers)
{
	double *flops = (double*)malloc(ns*sizeof(double));
	double **speed = (double**)malloc(ns*sizeof(double*));
	int i;
	for(i = 0; i < ns; i++)
		speed[i] = (double*)malloc(nw*sizeof(double));

	int w, s;

	struct sc_hypervisor_wrapper *sc_w = NULL;
	for(s = 0; s < ns; s++)
	{
		sc_w = sc_hypervisor_get_wrapper(sched_ctxs[s]);
		for(w = 0; w < nw; w++)
		{
			w_in_s[s][w] = 0.0;
			int worker = workers == NULL ? w : workers[w];

			speed[s][w] = sc_hypervisor_get_speed_per_worker(sc_w, worker);
			if(speed[s][w] == -1.0)
			{
				enum starpu_worker_archtype arch = starpu_worker_get_type(worker);
				speed[s][w] = sc_hypervisor_get_speed(sc_w, arch);
				if(arch == STARPU_CUDA_WORKER)
				{
					unsigned worker_in_ctx = starpu_sched_ctx_contains_worker(worker, sc_w->sched_ctx);
					if(!worker_in_ctx)
					{
						double transfer_speed = starpu_transfer_bandwidth(STARPU_MAIN_RAM, starpu_worker_get_memory_node(worker)) / 1000;
						speed[s][w] = (speed[s][w] * transfer_speed) / (speed[s][w] + transfer_speed);
					}
				}
			}
//			printf("v[w%d][s%d] = %lf\n", w, s, speed[s][w]);
		}

		struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sched_ctxs[s]);
		flops[s] = config->ispeed_ctx_sample/1000000000; /* in gflops */
	}

	/* take the execution time of the slowest ctx as a starting point
	   and try to minimize it, while allowing the faster ctxs to
	   increase theirs a little */
	double tmax = sc_hypervisor_get_slowest_ctx_exec_time();
	double smallest_tmax = sc_hypervisor_get_fastest_ctx_exec_time(); //tmax - 0.5*tmax;
//	printf("tmax %lf smallest %lf\n", tmax, smallest_tmax);

	double tmin = 0.0;

	struct ispeed_lp_data specific_data;
	specific_data.speed = speed;
	specific_data.flops = flops;
	specific_data.flops_on_w = flops_on_w;
	specific_data.workers = workers;

	unsigned found_sol = sc_hypervisor_lp_execute_dichotomy(ns, nw, w_in_s, 1, (void*)&specific_data,
								tmin, tmax, smallest_tmax, _compute_workers_distrib);

	for(i = 0; i < ns; i++)
		free(speed[i]);
	free(speed);
	free(flops); /* was leaked: allocated above but never released */
	return found_sol;
}
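
/* Solve the LP for the current contexts/workers and, if a solution is found,
 * round the fractional worker shares per architecture and apply the
 * resulting redistribution. */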
static void _try_resizing(unsigned *sched_ctxs, int nsched_ctxs, int *workers, int nworkers)
{
	int ns = sched_ctxs == NULL ? sc_hypervisor_get_nsched_ctxs() : nsched_ctxs;
	int nw = nworkers == -1 ? (int)starpu_worker_get_count() : nworkers; /* Number of different workers */

	unsigned *curr_sched_ctxs = sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : sched_ctxs;

	struct types_of_workers *tw = sc_hypervisor_get_types_of_workers(workers, nw);
	int ntypes_of_workers = tw->nw;

	double w_in_s[ns][nw];
	double **flops_on_w = (double**)malloc(ns*sizeof(double*));
	int i;
	for(i = 0; i < ns; i++)
		flops_on_w[i] = (double*)malloc(nw*sizeof(double));

	struct timeval start_time;
	struct timeval end_time;
	gettimeofday(&start_time, NULL);

	unsigned found_sol = _compute_flops_distribution_over_ctxs(ns, nw, w_in_s, flops_on_w, curr_sched_ctxs, workers);

	gettimeofday(&end_time, NULL);
	long diff_s = end_time.tv_sec - start_time.tv_sec;
	long diff_us = end_time.tv_usec - start_time.tv_usec;
	__attribute__((unused)) float timing = (float)(diff_s*1000000 + diff_us)/1000.0;

	/* if we did find at least one solution redistribute the resources */
	if(found_sol)
	{
		int w, s;
		double nworkers_per_ctx[ns][ntypes_of_workers];
		int nworkers_per_ctx_rounded[ns][ntypes_of_workers];
		for(s = 0; s < ns; s++)
		{
			for(w = 0; w < ntypes_of_workers; w++)
			{
				nworkers_per_ctx[s][w] = 0.0;
				nworkers_per_ctx_rounded[s][w] = 0;
			}
		}

		for(s = 0; s < ns; s++)
		{
			for(w = 0; w < nw; w++)
			{
				/* map the position in the workers array to the actual worker id,
				   as done in _compute_flops_distribution_over_ctxs */
				int worker = workers == NULL ? w : workers[w];
				enum starpu_worker_archtype arch = starpu_worker_get_type(worker);
				int idx = sc_hypervisor_get_index_for_arch(arch, tw);
				nworkers_per_ctx[s][idx] += w_in_s[s][w];

				/* round the fractional share: a GPU is kept with a
				   smaller share than a CPU */
				if(arch == STARPU_CUDA_WORKER)
				{
					if(w_in_s[s][w] >= 0.3)
						nworkers_per_ctx_rounded[s][idx]++;
				}
				else
				{
					if(w_in_s[s][w] > 0.5)
						nworkers_per_ctx_rounded[s][idx]++;
				}
			}
		}
/*		for(s = 0; s < ns; s++) */
/*			printf("%d: cpus = %lf gpus = %lf cpus_round = %d gpus_round = %d\n", s, nworkers[s][1], nworkers[s][0], */
/*			       nworkers_rounded[s][1], nworkers_rounded[s][0]); */

		sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, ntypes_of_workers, nworkers_per_ctx_rounded, nworkers_per_ctx, curr_sched_ctxs, tw);
	}

	free(tw);
	for(i = 0; i < ns; i++)
		free(flops_on_w[i]);
	free(flops_on_w);
}
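
/* Triggered on task completion: if the resize criterion is the observed
 * speed and a significant speed gap is detected between contexts, try a
 * resize (only if the hypervisor lock can be taken without blocking). */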
static void ispeed_lp_handle_poped_task(__attribute__((unused))unsigned sched_ctx, __attribute__((unused))int worker,
					__attribute__((unused))struct starpu_task *task, __attribute__((unused))uint32_t footprint)
{
	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
	if(ret != EBUSY)
	{
		unsigned criteria = sc_hypervisor_get_resize_criteria();
		if(criteria != SC_NOTHING && criteria == SC_SPEED)
		{
			if(sc_hypervisor_check_speed_gap_btw_ctxs(NULL, -1, NULL, -1))
			{
				_try_resizing(NULL, -1, NULL, -1);
			}
		}
		STARPU_PTHREAD_MUTEX_UNLOCK(&act_hypervisor_mutex);
	}
}
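
/* Triggered when a worker stays idle: same pattern as above, but driven
 * by the idleness criterion. */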
static void ispeed_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
{
	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
	if(ret != EBUSY)
	{
		unsigned criteria = sc_hypervisor_get_resize_criteria();
		if(criteria != SC_NOTHING && criteria == SC_IDLE)
		{
			if(sc_hypervisor_check_idle(sched_ctx, worker))
			{
				_try_resizing(NULL, -1, NULL, -1);
			}
		}
		STARPU_PTHREAD_MUTEX_UNLOCK(&act_hypervisor_mutex);
	}
}
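
/* Explicit resize request for a given set of contexts and workers. */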
static void ispeed_lp_resize_ctxs(unsigned *sched_ctxs, int nsched_ctxs, int *workers, int nworkers)
{
	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
	if(ret != EBUSY)
	{
		_try_resizing(sched_ctxs, nsched_ctxs, workers, nworkers);
		STARPU_PTHREAD_MUTEX_UNLOCK(&act_hypervisor_mutex);
	}
}
static void ispeed_lp_end_ctx(__attribute__((unused))unsigned sched_ctx)
{
/*	struct sc_hypervisor_wrapper *sc_w = sc_hypervisor_get_wrapper(sched_ctx); */
/*	int worker; */
/*	for(worker = 0; worker < 12; worker++) */
/*		printf("%d/%d: speed %lf\n", worker, sched_ctx, sc_w->ref_speed[worker]); */
	return;
}
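
/* Hypervisor policy based on an LP formulation of the instant speed of the
 * contexts; only the hooks used by this strategy are filled in. */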
struct sc_hypervisor_policy ispeed_lp_policy = {
	.size_ctxs = NULL,
	.resize_ctxs = ispeed_lp_resize_ctxs,
	.handle_poped_task = ispeed_lp_handle_poped_task,
	.handle_pushed_task = NULL,
	.handle_idle_cycle = ispeed_lp_handle_idle_cycle,
	.handle_idle_end = NULL,
	.handle_post_exec_hook = NULL,
	.handle_submitted_job = NULL,
	.end_ctx = ispeed_lp_end_ctx,
	.init_worker = NULL,
	.custom = 0,
	.name = "ispeed_lp"
};

#endif /* STARPU_HAVE_GLPK_H */