/* StarPU --- Runtime system for heterogeneous multicore architectures.
 *
 * Copyright (C) 2011, 2012 INRIA
 *
 * StarPU is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * StarPU is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * See the GNU Lesser General Public License in COPYING.LGPL for more details.
 */

#include "sc_hypervisor_policy.h"

static double _get_total_elapsed_flops_per_sched_ctx(unsigned sched_ctx)
{
	struct sc_hypervisor_wrapper *sc_w = sc_hypervisor_get_wrapper(sched_ctx);
	double ret_val = 0.0;
	int i;
	for(i = 0; i < STARPU_NMAXWORKERS; i++)
		ret_val += sc_w->total_elapsed_flops[i];
	return ret_val;
}

/* estimate when the context will finish by linearly extrapolating the flop
   rate observed since start_time; return -1.0 if no flops have been
   executed yet */
double _get_exp_end(unsigned sched_ctx)
{
	struct sc_hypervisor_wrapper *sc_w = sc_hypervisor_get_wrapper(sched_ctx);
	double elapsed_flops = sc_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);

	if(elapsed_flops >= 1.0)
	{
		double curr_time = starpu_timing_now();
		double elapsed_time = curr_time - sc_w->start_time;
		double exp_end = (elapsed_time * sc_w->remaining_flops / elapsed_flops) + curr_time;
		return exp_end;
	}
	return -1.0;
}

/* compute the fraction of flops left to be executed out of the total flops
   the context has to execute */
double _get_flops_left_pct(unsigned sched_ctx)
{
	struct sc_hypervisor_wrapper *wrapper = sc_hypervisor_get_wrapper(sched_ctx);
	double total_elapsed_flops = _get_total_elapsed_flops_per_sched_ctx(sched_ctx);
	if(total_elapsed_flops >= wrapper->total_flops)
		return 0.0;

	return (wrapper->total_flops - total_elapsed_flops) / wrapper->total_flops;
}

/* select the workers to be moved so that the sender and the receiver
   context finish at the same time */
static int *_get_workers_to_move(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, int *nworkers)
{
	struct sc_hypervisor_wrapper *sender_sc_w = sc_hypervisor_get_wrapper(sender_sched_ctx);
	struct sc_hypervisor_wrapper *receiver_sc_w = sc_hypervisor_get_wrapper(receiver_sched_ctx);
	int *workers = NULL;
	double v_receiver = sc_hypervisor_get_ctx_speed(receiver_sc_w);
	double receiver_remaining_flops = receiver_sc_w->remaining_flops;
	double sender_exp_end = _get_exp_end(sender_sched_ctx);
	double sender_v_cpu = sc_hypervisor_get_speed_per_worker_type(sender_sc_w, STARPU_CPU_WORKER);
	/* extra speed the receiver needs in order to finish by the time the
	   sender is expected to finish */
	double v_for_rctx = (receiver_remaining_flops / (sender_exp_end - starpu_timing_now())) - v_receiver;

	int nworkers_needed = v_for_rctx / sender_v_cpu;
/*	printf("%d->%d: v_rec %lf v %lf v_cpu %lf w_needed %d \n", sender_sched_ctx, receiver_sched_ctx, */
/*	       v_receiver, v_for_rctx, sender_v_cpu, nworkers_needed); */
	if(nworkers_needed > 0)
	{
		struct sc_hypervisor_policy_config *sender_config = sc_hypervisor_get_config(sender_sched_ctx);
		int potential_moving_cpus = sc_hypervisor_get_movable_nworkers(sender_config, sender_sched_ctx, STARPU_CPU_WORKER);
		int potential_moving_gpus = sc_hypervisor_get_movable_nworkers(sender_config, sender_sched_ctx, STARPU_CUDA_WORKER);
		int sender_nworkers = (int)starpu_sched_ctx_get_nworkers(sender_sched_ctx);
		struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(receiver_sched_ctx);
		int nworkers_ctx = (int)starpu_sched_ctx_get_nworkers(receiver_sched_ctx);

		if(nworkers_needed < (potential_moving_cpus + 5 * potential_moving_gpus))
		{
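			/* A CUDA worker is weighted as 5 CPU equivalents both in the
			   test above and in the split below: e.g. with
			   nworkers_needed == 12 the move request becomes
			   ngpus = 12/5 = 2 and ncpus = 12 - 2 = 10;
			   sc_hypervisor_get_idlest_workers() may then lower each
			   count to what is actually movable. */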
			if((sender_nworkers - nworkers_needed) >= sender_config->min_nworkers)
			{
				if((nworkers_ctx + nworkers_needed) > config->max_nworkers)
					nworkers_needed = nworkers_ctx > config->max_nworkers ? 0 : (config->max_nworkers - nworkers_ctx);

				if(nworkers_needed > 0)
				{
					int ngpus = nworkers_needed / 5;
					int *gpus;
					gpus = sc_hypervisor_get_idlest_workers(sender_sched_ctx, &ngpus, STARPU_CUDA_WORKER);
					int ncpus = nworkers_needed - ngpus;
					int *cpus;
					cpus = sc_hypervisor_get_idlest_workers(sender_sched_ctx, &ncpus, STARPU_CPU_WORKER);
					workers = (int*)malloc(nworkers_needed * sizeof(int));
					int i;
					printf("%d: gpus: ", nworkers_needed);
					for(i = 0; i < ngpus; i++)
					{
						workers[(*nworkers)++] = gpus[i];
						printf("%d ", gpus[i]);
					}
					printf(" cpus:");
					for(i = 0; i < ncpus; i++)
					{
						workers[(*nworkers)++] = cpus[i];
						printf("%d ", cpus[i]);
					}
					printf("\n");
					free(gpus);
					free(cpus);
				}
			}
		}
		else
		{
			/* if the needed number of workers is too big we only move
			   the number of workers corresponding to the granularity
			   set by the user */
			int nworkers_to_move = sc_hypervisor_compute_nworkers_to_move(sender_sched_ctx);

			if(sender_nworkers - nworkers_to_move >= sender_config->min_nworkers)
			{
				int nshared_workers = (int)starpu_sched_ctx_get_nshared_workers(sender_sched_ctx, receiver_sched_ctx);
				if((nworkers_ctx + nworkers_to_move - nshared_workers) > config->max_nworkers)
					nworkers_to_move = nworkers_ctx > config->max_nworkers ? 0 : (config->max_nworkers - nworkers_ctx + nshared_workers);

				if(nworkers_to_move > 0)
				{
					workers = sc_hypervisor_get_idlest_workers(sender_sched_ctx, &nworkers_to_move, STARPU_ANY_WORKER);
					*nworkers = nworkers_to_move;
				}
			}
		}
	}
	return workers;
}

static unsigned _gflops_rate_resize(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, unsigned force_resize)
{
	int ret = 1;
	if(force_resize)
		starpu_pthread_mutex_lock(&act_hypervisor_mutex);
	else
		ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
	if(ret != EBUSY)
	{
		int nworkers_to_move = 0;
		int *workers_to_move = _get_workers_to_move(sender_sched_ctx, receiver_sched_ctx, &nworkers_to_move);
		if(nworkers_to_move > 0)
		{
			sc_hypervisor_move_workers(sender_sched_ctx, receiver_sched_ctx, workers_to_move, nworkers_to_move, 0);

			struct sc_hypervisor_policy_config *new_config = sc_hypervisor_get_config(receiver_sched_ctx);
			int i;
			for(i = 0; i < nworkers_to_move; i++)
				new_config->max_idle[workers_to_move[i]] = new_config->max_idle[workers_to_move[i]] != MAX_IDLE_TIME ?
					new_config->max_idle[workers_to_move[i]] : new_config->new_workers_max_idle;

			free(workers_to_move);
		}
		starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
		return 1;
	}
	return 0;
}
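/* The helpers below rank contexts by the expected end time computed in
   _get_exp_end(), i.e. the linear extrapolation

       exp_end = now + elapsed_time * remaining_flops / elapsed_flops

   A context that has executed no flops yet reports -1.0: the fastest-
   context search skips it, while the slowest-context searches return it
   immediately so that a starved context gets resources first. */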
static int _find_fastest_sched_ctx(void)
{
	unsigned *sched_ctxs = sc_hypervisor_get_sched_ctxs();
	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();

	double first_exp_end = _get_exp_end(sched_ctxs[0]);
	int fastest_sched_ctx = first_exp_end == -1.0 ? -1 : (int)sched_ctxs[0];
	double curr_exp_end = 0.0;
	int i;
	for(i = 1; i < nsched_ctxs; i++)
	{
		curr_exp_end = _get_exp_end(sched_ctxs[i]);
		if((curr_exp_end < first_exp_end || first_exp_end == -1.0) && curr_exp_end != -1.0)
		{
			first_exp_end = curr_exp_end;
			fastest_sched_ctx = sched_ctxs[i];
		}
	}
	return fastest_sched_ctx;
}

static int _find_slowest_sched_ctx(void)
{
	unsigned *sched_ctxs = sc_hypervisor_get_sched_ctxs();
	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();

	int slowest_sched_ctx = -1;
	double curr_exp_end = 0.0;
	double last_exp_end = -1.0;
	int i;
	for(i = 0; i < nsched_ctxs; i++)
	{
		curr_exp_end = _get_exp_end(sched_ctxs[i]);
		/* if it hasn't started because of no resources, give it priority */
		if(curr_exp_end == -1.0)
			return sched_ctxs[i];
		if(curr_exp_end > last_exp_end)
		{
			slowest_sched_ctx = sched_ctxs[i];
			last_exp_end = curr_exp_end;
		}
	}
	return slowest_sched_ctx;
}

static int _find_slowest_available_sched_ctx(unsigned sched_ctx)
{
	unsigned *sched_ctxs = sc_hypervisor_get_sched_ctxs();
	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();

	int slowest_sched_ctx = -1;
	double curr_exp_end = 0.0;
	double last_exp_end = -1.0;
	int i;
	for(i = 0; i < nsched_ctxs; i++)
	{
		if(sched_ctxs[i] != sched_ctx)
		{
			curr_exp_end = _get_exp_end(sched_ctxs[i]);
			/* if it hasn't started because of no resources, give it priority */
			if(curr_exp_end == -1.0)
				return sched_ctxs[i];
			if(last_exp_end < curr_exp_end)
			{
				slowest_sched_ctx = sched_ctxs[i];
				last_exp_end = curr_exp_end;
			}
		}
	}
	return slowest_sched_ctx;
}

static void gflops_rate_resize(unsigned sched_ctx)
{
	_get_exp_end(sched_ctx);
	double flops_left_pct = _get_flops_left_pct(sched_ctx);

	/* if the context has finished all the flops it had to execute,
	   we move all its resources to the slowest context */
	if(flops_left_pct == 0.0f)
	{
		int slowest_sched_ctx = _find_slowest_available_sched_ctx(sched_ctx);
		if(slowest_sched_ctx != -1)
		{
			double slowest_flops_left_pct = _get_flops_left_pct(slowest_sched_ctx);
			if(slowest_flops_left_pct != 0.0f)
			{
				struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sched_ctx);
				config->min_nworkers = 0;
				config->max_nworkers = 0;
				printf("ctx %u finished & gives away its resources to %d; slow_left %lf\n", sched_ctx, slowest_sched_ctx, slowest_flops_left_pct);
				sc_hypervisor_policy_resize(sched_ctx, slowest_sched_ctx, 1, 1);
				sc_hypervisor_stop_resize(slowest_sched_ctx);
			}
		}
	}

	int fastest_sched_ctx = _find_fastest_sched_ctx();
	int slowest_sched_ctx = _find_slowest_sched_ctx();

	if(fastest_sched_ctx != -1 && slowest_sched_ctx != -1 && fastest_sched_ctx != slowest_sched_ctx)
	{
		double fastest_exp_end = _get_exp_end(fastest_sched_ctx);
		double slowest_exp_end = _get_exp_end(slowest_sched_ctx);

		/* resize only if the slowest context is expected to take at
		   least 50% longer than the fastest one */
		if((slowest_exp_end == -1.0 && fastest_exp_end != -1.0) ||
		   ((fastest_exp_end + (fastest_exp_end * 0.5)) < slowest_exp_end))
		{
			double fast_flops_left_pct = _get_flops_left_pct(fastest_sched_ctx);
			if(fast_flops_left_pct < 0.8)
			{
				struct sc_hypervisor_wrapper *sc_w = sc_hypervisor_get_wrapper(slowest_sched_ctx);
				double elapsed_flops = sc_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);
				/* make sure the slowest context has executed enough
				   flops for its speed estimate to be meaningful */
				if((elapsed_flops / sc_w->total_flops) > 0.1)
					_gflops_rate_resize(fastest_sched_ctx, slowest_sched_ctx, 0);
			}
		}
	}
}

static void gflops_rate_handle_poped_task(unsigned sched_ctx, __attribute__((unused)) int worker,
					  __attribute__((unused)) struct starpu_task *task,
					  __attribute__((unused)) uint32_t footprint)
{
	gflops_rate_resize(sched_ctx);
}
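/* Only the poped-task hook is implemented: the policy re-evaluates the
   resizing opportunity each time a task is popped from a context; all
   other hypervisor hooks are left NULL. */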
struct sc_hypervisor_policy gflops_rate_policy =
{
	.size_ctxs = NULL,
	.resize_ctxs = NULL,
	.handle_poped_task = gflops_rate_handle_poped_task,
	.handle_pushed_task = NULL,
	.handle_idle_cycle = NULL,
	.handle_idle_end = NULL,
	.handle_post_exec_hook = NULL,
	.handle_submitted_job = NULL,
	.end_ctx = NULL,
	.custom = 0,
	.name = "gflops_rate"
};
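/* Illustrative usage sketch (an assumption, not compiled as part of this
   file), relying on the usual sc_hypervisor entry points: an application
   would start the hypervisor with this policy and declare the total flops
   of each registered context, e.g.

       struct starpu_sched_ctx_performance_counters *counters =
               sc_hypervisor_init(&gflops_rate_policy);
       sc_hypervisor_register_ctx(sched_ctx1, flops1);
       sc_hypervisor_register_ctx(sched_ctx2, flops2);

   From then on, gflops_rate_handle_poped_task() runs on every popped task
   and calls gflops_rate_resize() to rebalance workers between the fastest
   and the slowest context. */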