123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355 |
- /*
- * StarPU
- * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
- #include <unistd.h>
- #include <sys/time.h>
- #include <starpu.h>
- #include <common/config.h>
- #include <core/perfmodel/perfmodel.h>
- #include <datawizard/data_parameters.h>
- #define SIZE (32*1024*1024*sizeof(char))
- #define NITER 128
- static double bandwith_matrix[MAXNODES][MAXNODES] = {{-1.0}};
- static double latency_matrix[MAXNODES][MAXNODES] = {{ -1.0}};
- /* Benchmarking the performance of the bus */
- static double cudadev_timing_htod[MAXNODES] = {0.0};
- static double cudadev_timing_dtoh[MAXNODES] = {0.0};
- static int ncuda;
- static unsigned was_benchmarked = 0;
- static void measure_bandwith_between_host_and_dev(int dev)
- {
- /* Initiliaze CUDA context on the device */
- cudaSetDevice(dev);
- /* hack to force the initialization */
- cudaFree(0);
- /* Allocate a buffer on the device */
- unsigned char *d_buffer;
- cudaMalloc((void **)&d_buffer, SIZE);
- assert(d_buffer);
- /* Allocate a buffer on the host */
- unsigned char *h_buffer;
- cudaHostAlloc((void **)&h_buffer, SIZE, 0);
- assert(h_buffer);
- /* Fill them */
- memset(h_buffer, 0, SIZE);
- cudaMemset(d_buffer, 0, SIZE);
- unsigned iter;
- double timing;
- struct timeval start;
- struct timeval end;
- /* Measure upload bandwith */
- gettimeofday(&start, NULL);
- for (iter = 0; iter < NITER; iter++)
- {
- cudaMemcpy(d_buffer, h_buffer, SIZE, cudaMemcpyHostToDevice);
- cudaThreadSynchronize();
- }
- gettimeofday(&end, NULL);
- timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
- cudadev_timing_htod[dev+1] = timing/NITER;
- /* Measure download bandwith */
- gettimeofday(&start, NULL);
- for (iter = 0; iter < NITER; iter++)
- {
- cudaMemcpy(h_buffer, d_buffer, SIZE, cudaMemcpyDeviceToHost);
- cudaThreadSynchronize();
- }
- gettimeofday(&end, NULL);
- timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
- cudadev_timing_dtoh[dev+1] = timing/NITER;
- /* Free buffers */
- cudaFreeHost(h_buffer);
- cudaFree(d_buffer);
- cudaThreadExit();
- }
- static void benchmark_all_cuda_devices(void)
- {
- #ifdef VERBOSE
- fprintf(stderr, "Benchmarking the speed of the bus\n");
- #endif
- cudaGetDeviceCount(&ncuda);
- int i;
- for (i = 0; i < ncuda; i++)
- {
- /* measure bandwith between Host and Device i */
- measure_bandwith_between_host_and_dev(i);
- }
- was_benchmarked = 1;
- #ifdef VERBOSE
- fprintf(stderr, "Benchmarking the speed of the bus is done.\n");
- #endif
- }
- static void get_bus_path(const char *type, char *path, size_t maxlen)
- {
- strncpy(path, PERF_MODEL_DIR_BUS, maxlen);
- strncat(path, type, maxlen);
-
- char hostname[32];
- gethostname(hostname, 32);
- strncat(path, ".", maxlen);
- strncat(path, hostname, maxlen);
- }
- /*
- * Latency
- */
- static void get_latency_path(char *path, size_t maxlen)
- {
- get_bus_path("latency", path, maxlen);
- }
- static void load_bus_latency_file_content(void)
- {
- int n;
- unsigned src, dst;
- FILE *f;
- char path[256];
- get_latency_path(path, 256);
- f = fopen(path, "r");
- STARPU_ASSERT(f);
- for (src = 0; src < MAXNODES; src++)
- {
- for (dst = 0; dst < MAXNODES; dst++)
- {
- double latency;
- n = fscanf(f, "%lf ", &latency);
- STARPU_ASSERT(n == 1);
- latency_matrix[src][dst] = latency;
- }
- n = fscanf(f, "\n");
- STARPU_ASSERT(n == 0);
- }
- fclose(f);
- }
- static void write_bus_latency_file_content(void)
- {
- int src, dst;
- FILE *f;
- STARPU_ASSERT(was_benchmarked);
- char path[256];
- get_latency_path(path, 256);
- f = fopen(path, "w+");
- STARPU_ASSERT(f);
- for (src = 0; src < MAXNODES; src++)
- {
- for (dst = 0; dst < MAXNODES; dst++)
- {
- double latency;
- if ((src > ncuda) || (dst > ncuda))
- {
- /* convention */
- latency = -1.0;
- }
- else if (src == dst)
- {
- latency = 0.0;
- }
- else {
- latency = ((src && dst)?2000.0:500.0);
- }
- fprintf(f, "%lf ", latency);
- }
- fprintf(f, "\n");
- }
- fclose(f);
- }
- static void load_bus_latency_file(void)
- {
- int res;
- char path[256];
- get_latency_path(path, 256);
- res = access(path, F_OK);
- if (res)
- {
- /* File does not exist yet */
- if (!was_benchmarked)
- benchmark_all_cuda_devices();
- write_bus_latency_file_content();
- }
- load_bus_latency_file_content();
- }
- /*
- * Bandwith
- */
- static void get_bandwith_path(char *path, size_t maxlen)
- {
- get_bus_path("bandwith", path, maxlen);
- }
- static void load_bus_bandwith_file_content(void)
- {
- int n;
- unsigned src, dst;
- FILE *f;
- char path[256];
- get_bandwith_path(path, 256);
- f = fopen(path, "r");
- STARPU_ASSERT(f);
- for (src = 0; src < MAXNODES; src++)
- {
- for (dst = 0; dst < MAXNODES; dst++)
- {
- double bandwith;
- n = fscanf(f, "%lf ", &bandwith);
- STARPU_ASSERT(n == 1);
- bandwith_matrix[src][dst] = bandwith;
- }
- n = fscanf(f, "\n");
- STARPU_ASSERT(n == 0);
- }
- fclose(f);
- }
- static void write_bus_bandwith_file_content(void)
- {
- int src, dst;
- FILE *f;
- STARPU_ASSERT(was_benchmarked);
- char path[256];
- get_bandwith_path(path, 256);
- f = fopen(path, "w+");
- STARPU_ASSERT(f);
- for (src = 0; src < MAXNODES; src++)
- {
- for (dst = 0; dst < MAXNODES; dst++)
- {
- double bandwith;
-
- if ((src > ncuda) || (dst > ncuda))
- {
- bandwith = -1.0;
- }
- else if (src != dst)
- {
- /* Bandwith = (SIZE)/(time i -> ram + time ram -> j)*/
- double time_src_to_ram = (src==0)?0.0:cudadev_timing_dtoh[src];
- double time_ram_to_dst = (dst==0)?0.0:cudadev_timing_htod[dst];
-
- double timing =time_src_to_ram + time_ram_to_dst;
-
- bandwith = 1.0*SIZE/timing;
- }
- else {
- /* convention */
- bandwith = 0.0;
- }
-
- fprintf(f, "%lf ", bandwith);
- }
- fprintf(f, "\n");
- }
- fclose(f);
- }
- static void load_bus_bandwith_file(void)
- {
- int res;
- char path[256];
- get_bandwith_path(path, 256);
- res = access(path, F_OK);
- if (res)
- {
- /* File does not exist yet */
- if (!was_benchmarked)
- benchmark_all_cuda_devices();
- write_bus_bandwith_file_content();
- }
- load_bus_bandwith_file_content();
- }
- /*
- * Generic
- */
- void load_bus_performance_files(void)
- {
- load_bus_latency_file();
- load_bus_bandwith_file();
- }
- double predict_transfer_time(unsigned src_node, unsigned dst_node, size_t size)
- {
- double bandwith = bandwith_matrix[src_node][dst_node];
- double latency = latency_matrix[src_node][dst_node];
- return latency + size/bandwith;
- }
|