123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623 |
- /*
- * StarPU
- * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
- #include <unistd.h>
- #include <sys/time.h>
- #include <stdlib.h>
- #include <starpu.h>
- #include <common/config.h>
- #include <core/workers.h>
- #include <core/perfmodel/perfmodel.h>
- #define SIZE (32*1024*1024*sizeof(char))
- #define NITER 128
- #define MAXCPUS 32
- struct cudadev_timing {
- int cpu_id;
- double timing_htod;
- double timing_dtoh;
- };
- static double bandwith_matrix[STARPU_MAXNODES][STARPU_MAXNODES] = {{-1.0}};
- static double latency_matrix[STARPU_MAXNODES][STARPU_MAXNODES] = {{ -1.0}};
- static unsigned was_benchmarked = 0;
- static int ncuda = 0;
- static int affinity_matrix[STARPU_MAXCUDADEVS][MAXCPUS];
- /* Benchmarking the performance of the bus */
- #ifdef USE_CUDA
- static double cudadev_timing_htod[STARPU_MAXNODES] = {0.0};
- static double cudadev_timing_dtoh[STARPU_MAXNODES] = {0.0};
- static struct cudadev_timing cudadev_timing_per_cpu[STARPU_MAXNODES][MAXCPUS];
- static void measure_bandwith_between_host_and_dev_on_cpu(int dev, int cpu)
- {
- struct machine_config_s *config = _starpu_get_machine_config();
- _starpu_bind_thread_on_cpu(config, cpu);
- /* Initiliaze CUDA context on the device */
- cudaSetDevice(dev);
- /* hack to avoid third party libs to rebind threads */
- _starpu_bind_thread_on_cpu(config, cpu);
- /* hack to force the initialization */
- cudaFree(0);
- /* hack to avoid third party libs to rebind threads */
- _starpu_bind_thread_on_cpu(config, cpu);
- /* Allocate a buffer on the device */
- unsigned char *d_buffer;
- cudaMalloc((void **)&d_buffer, SIZE);
- assert(d_buffer);
- /* hack to avoid third party libs to rebind threads */
- _starpu_bind_thread_on_cpu(config, cpu);
- /* Allocate a buffer on the host */
- unsigned char *h_buffer;
- cudaHostAlloc((void **)&h_buffer, SIZE, 0);
- assert(h_buffer);
- /* hack to avoid third party libs to rebind threads */
- _starpu_bind_thread_on_cpu(config, cpu);
- /* Fill them */
- memset(h_buffer, 0, SIZE);
- cudaMemset(d_buffer, 0, SIZE);
- /* hack to avoid third party libs to rebind threads */
- _starpu_bind_thread_on_cpu(config, cpu);
- unsigned iter;
- double timing;
- struct timeval start;
- struct timeval end;
- cudadev_timing_per_cpu[dev+1][cpu].cpu_id = cpu;
- /* Measure upload bandwith */
- gettimeofday(&start, NULL);
- for (iter = 0; iter < NITER; iter++)
- {
- cudaMemcpy(d_buffer, h_buffer, SIZE, cudaMemcpyHostToDevice);
- cudaThreadSynchronize();
- }
- gettimeofday(&end, NULL);
- timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
- cudadev_timing_per_cpu[dev+1][cpu].timing_htod = timing/NITER;
- /* Measure download bandwith */
- gettimeofday(&start, NULL);
- for (iter = 0; iter < NITER; iter++)
- {
- cudaMemcpy(h_buffer, d_buffer, SIZE, cudaMemcpyDeviceToHost);
- cudaThreadSynchronize();
- }
- gettimeofday(&end, NULL);
- timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
- cudadev_timing_per_cpu[dev+1][cpu].timing_dtoh = timing/NITER;
- /* Free buffers */
- cudaFreeHost(h_buffer);
- cudaFree(d_buffer);
- cudaThreadExit();
- }
- /* NB: we want to sort the bandwith by DECREASING order */
- int compar_cudadev_timing(const void *left_cudadev_timing, const void *right_cudadev_timing)
- {
- const struct cudadev_timing *left = left_cudadev_timing;
- const struct cudadev_timing *right = right_cudadev_timing;
-
- double left_dtoh = left->timing_dtoh;
- double left_htod = left->timing_htod;
- double right_dtoh = right->timing_dtoh;
- double right_htod = right->timing_htod;
-
- double bandwith_sum2_left = left_dtoh*left_dtoh + left_htod*left_htod;
- double bandwith_sum2_right = right_dtoh*right_dtoh + right_htod*right_htod;
- /* it's for a decreasing sorting */
- return (bandwith_sum2_left < bandwith_sum2_right);
- }
- static void measure_bandwith_between_host_and_dev(int dev, unsigned ncores)
- {
- unsigned core;
- for (core = 0; core < ncores; core++)
- {
- measure_bandwith_between_host_and_dev_on_cpu(dev, core);
- }
- /* sort the results */
- qsort(cudadev_timing_per_cpu[dev+1], ncores,
- sizeof(struct cudadev_timing),
- compar_cudadev_timing);
-
- #ifdef VERBOSE
- for (core = 0; core < ncores; core++)
- {
- unsigned current_core = cudadev_timing_per_cpu[dev+1][core].cpu_id;
- double bandwith_dtoh = cudadev_timing_per_cpu[dev+1][core].timing_dtoh;
- double bandwith_htod = cudadev_timing_per_cpu[dev+1][core].timing_htod;
- double bandwith_sum2 = bandwith_dtoh*bandwith_dtoh + bandwith_htod*bandwith_htod;
- fprintf(stderr, "BANDWITH GPU %d CPU %d - htod %lf - dtoh %lf - %lf\n", dev, current_core, bandwith_htod, bandwith_dtoh, sqrt(bandwith_sum2));
- }
- unsigned best_core = cudadev_timing_per_cpu[dev+1][0].cpu_id;
- fprintf(stderr, "BANDWITH GPU %d BEST CPU %d\n", dev, best_core);
- #endif
- /* The results are sorted in a decreasing order, so that the best
- * measurement is currently the first entry. */
- cudadev_timing_dtoh[dev+1] = cudadev_timing_per_cpu[dev+1][0].timing_dtoh;
- cudadev_timing_htod[dev+1] = cudadev_timing_per_cpu[dev+1][0].timing_htod;
- }
- #endif
- static void benchmark_all_cuda_devices(void)
- {
- int ret;
- #ifdef VERBOSE
- fprintf(stderr, "Benchmarking the speed of the bus\n");
- #endif
- /* Save the current cpu binding */
- cpu_set_t former_process_affinity;
- ret = sched_getaffinity(0, sizeof(former_process_affinity), &former_process_affinity);
- if (ret)
- {
- perror("sched_getaffinity");
- STARPU_ABORT();
- }
- #ifdef USE_CUDA
- struct machine_config_s *config = _starpu_get_machine_config();
- unsigned ncores = _starpu_topology_get_nhwcore(config);
- cudaGetDeviceCount(&ncuda);
- int i;
- for (i = 0; i < ncuda; i++)
- {
- /* measure bandwith between Host and Device i */
- measure_bandwith_between_host_and_dev(i, ncores);
- }
- #endif
- was_benchmarked = 1;
- /* Restore the former affinity */
- ret = sched_setaffinity(0, sizeof(former_process_affinity), &former_process_affinity);
- if (ret)
- {
- perror("sched_setaffinity");
- STARPU_ABORT();
- }
- #ifdef VERBOSE
- fprintf(stderr, "Benchmarking the speed of the bus is done.\n");
- #endif
- }
- static void get_bus_path(const char *type, char *path, size_t maxlen)
- {
- _starpu_get_perf_model_dir_bus(path, maxlen);
- strncat(path, type, maxlen);
-
- char hostname[32];
- gethostname(hostname, 32);
- strncat(path, ".", maxlen);
- strncat(path, hostname, maxlen);
- }
- /*
- * Affinity
- */
- static void get_affinity_path(char *path, size_t maxlen)
- {
- get_bus_path("affinity", path, maxlen);
- }
- static void load_bus_affinity_file_content(void)
- {
- FILE *f;
- char path[256];
- get_affinity_path(path, 256);
- f = fopen(path, "r");
- STARPU_ASSERT(f);
- #ifdef USE_CUDA
- struct machine_config_s *config = _starpu_get_machine_config();
- unsigned ncores = _starpu_topology_get_nhwcore(config);
- cudaGetDeviceCount(&ncuda);
- int gpu;
- for (gpu = 0; gpu < ncuda; gpu++)
- {
- int ret;
- int dummy;
- ret = fscanf(f, "%d\t", &dummy);
- STARPU_ASSERT(ret == 1);
- STARPU_ASSERT(dummy == gpu);
- unsigned core;
- for (core = 0; core < ncores; core++)
- {
- ret = fscanf(f, "%d\t", &affinity_matrix[gpu][core]);
- STARPU_ASSERT(ret == 1);
- }
- ret = fscanf(f, "\n");
- STARPU_ASSERT(ret == 0);
- }
- #endif
- fclose(f);
- }
- static void write_bus_affinity_file_content(void)
- {
- FILE *f;
- STARPU_ASSERT(was_benchmarked);
- char path[256];
- get_affinity_path(path, 256);
- f = fopen(path, "w+");
- if (!f)
- {
- perror("fopen");
- STARPU_ABORT();
- }
- #ifdef USE_CUDA
- struct machine_config_s *config = _starpu_get_machine_config();
- unsigned ncores = _starpu_topology_get_nhwcore(config);
- int gpu;
- for (gpu = 0; gpu < ncuda; gpu++)
- {
- fprintf(f, "%d\t", gpu);
- unsigned core;
- for (core = 0; core < ncores; core++)
- {
- fprintf(f, "%d\t", cudadev_timing_per_cpu[gpu+1][core].cpu_id);
- }
- fprintf(f, "\n");
- }
- #endif
- fclose(f);
- }
- static void generate_bus_affinity_file(void)
- {
- if (!was_benchmarked)
- benchmark_all_cuda_devices();
- write_bus_affinity_file_content();
- }
- static void load_bus_affinity_file(void)
- {
- int res;
- char path[256];
- get_affinity_path(path, 256);
- res = access(path, F_OK);
- if (res)
- {
- /* File does not exist yet */
- generate_bus_affinity_file();
- }
- load_bus_affinity_file_content();
- }
- int *get_gpu_affinity_vector(unsigned gpuid)
- {
- return affinity_matrix[gpuid];
- }
- /*
- * Latency
- */
- static void get_latency_path(char *path, size_t maxlen)
- {
- get_bus_path("latency", path, maxlen);
- }
- static void load_bus_latency_file_content(void)
- {
- int n;
- unsigned src, dst;
- FILE *f;
- char path[256];
- get_latency_path(path, 256);
- f = fopen(path, "r");
- STARPU_ASSERT(f);
- for (src = 0; src < STARPU_MAXNODES; src++)
- {
- for (dst = 0; dst < STARPU_MAXNODES; dst++)
- {
- double latency;
- n = fscanf(f, "%lf ", &latency);
- STARPU_ASSERT(n == 1);
- latency_matrix[src][dst] = latency;
- }
- n = fscanf(f, "\n");
- STARPU_ASSERT(n == 0);
- }
- fclose(f);
- }
- static void write_bus_latency_file_content(void)
- {
- int src, dst;
- FILE *f;
- STARPU_ASSERT(was_benchmarked);
- char path[256];
- get_latency_path(path, 256);
- f = fopen(path, "w+");
- if (!f)
- {
- perror("fopen");
- STARPU_ABORT();
- }
- for (src = 0; src < STARPU_MAXNODES; src++)
- {
- for (dst = 0; dst < STARPU_MAXNODES; dst++)
- {
- double latency;
- if ((src > ncuda) || (dst > ncuda))
- {
- /* convention */
- latency = -1.0;
- }
- else if (src == dst)
- {
- latency = 0.0;
- }
- else {
- latency = ((src && dst)?2000.0:500.0);
- }
- fprintf(f, "%lf ", latency);
- }
- fprintf(f, "\n");
- }
- fclose(f);
- }
- static void generate_bus_latency_file(void)
- {
- if (!was_benchmarked)
- benchmark_all_cuda_devices();
- write_bus_latency_file_content();
- }
- static void load_bus_latency_file(void)
- {
- int res;
- char path[256];
- get_latency_path(path, 256);
- res = access(path, F_OK);
- if (res)
- {
- /* File does not exist yet */
- generate_bus_latency_file();
- }
- load_bus_latency_file_content();
- }
- /*
- * Bandwith
- */
- static void get_bandwith_path(char *path, size_t maxlen)
- {
- get_bus_path("bandwith", path, maxlen);
- }
- static void load_bus_bandwith_file_content(void)
- {
- int n;
- unsigned src, dst;
- FILE *f;
- char path[256];
- get_bandwith_path(path, 256);
- f = fopen(path, "r");
- if (!f)
- {
- perror("fopen");
- STARPU_ABORT();
- }
- for (src = 0; src < STARPU_MAXNODES; src++)
- {
- for (dst = 0; dst < STARPU_MAXNODES; dst++)
- {
- double bandwith;
- n = fscanf(f, "%lf ", &bandwith);
- STARPU_ASSERT(n == 1);
- bandwith_matrix[src][dst] = bandwith;
- }
- n = fscanf(f, "\n");
- STARPU_ASSERT(n == 0);
- }
- fclose(f);
- }
- static void write_bus_bandwith_file_content(void)
- {
- int src, dst;
- FILE *f;
- STARPU_ASSERT(was_benchmarked);
- char path[256];
- get_bandwith_path(path, 256);
- f = fopen(path, "w+");
- STARPU_ASSERT(f);
- for (src = 0; src < STARPU_MAXNODES; src++)
- {
- for (dst = 0; dst < STARPU_MAXNODES; dst++)
- {
- double bandwith;
-
- if ((src > ncuda) || (dst > ncuda))
- {
- bandwith = -1.0;
- }
- #ifdef USE_CUDA
- else if (src != dst)
- {
- /* Bandwith = (SIZE)/(time i -> ram + time ram -> j)*/
- double time_src_to_ram = (src==0)?0.0:cudadev_timing_dtoh[src];
- double time_ram_to_dst = (dst==0)?0.0:cudadev_timing_htod[dst];
-
- double timing =time_src_to_ram + time_ram_to_dst;
-
- bandwith = 1.0*SIZE/timing;
- }
- #endif
- else {
- /* convention */
- bandwith = 0.0;
- }
-
- fprintf(f, "%lf ", bandwith);
- }
- fprintf(f, "\n");
- }
- fclose(f);
- }
- static void generate_bus_bandwith_file(void)
- {
- if (!was_benchmarked)
- benchmark_all_cuda_devices();
- write_bus_bandwith_file_content();
- }
- static void load_bus_bandwith_file(void)
- {
- int res;
- char path[256];
- get_bandwith_path(path, 256);
- res = access(path, F_OK);
- if (res)
- {
- /* File does not exist yet */
- generate_bus_bandwith_file();
- }
- load_bus_bandwith_file_content();
- }
- /*
- * Generic
- */
- void starpu_force_bus_sampling(void)
- {
- create_sampling_directory_if_needed();
- generate_bus_affinity_file();
- generate_bus_latency_file();
- generate_bus_bandwith_file();
- }
- void load_bus_performance_files(void)
- {
- create_sampling_directory_if_needed();
- load_bus_affinity_file();
- load_bus_latency_file();
- load_bus_bandwith_file();
- }
- double predict_transfer_time(unsigned src_node, unsigned dst_node, size_t size)
- {
- double bandwith = bandwith_matrix[src_node][dst_node];
- double latency = latency_matrix[src_node][dst_node];
- return latency + size/bandwith;
- }
|