/* * StarPU * Copyright (C) INRIA 2008-2009 (see AUTHORS file) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; either version 2.1 of the License, or (at * your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * * See the GNU Lesser General Public License in COPYING.LGPL for more details. */ #include #include #include #include #include #include #define SIZE (32*1024*1024*sizeof(char)) #define NITER 128 static double bandwith_matrix[MAXNODES][MAXNODES] = {{-1.0}}; static double latency_matrix[MAXNODES][MAXNODES] = {{ -1.0}}; /* Benchmarking the performance of the bus */ static double cudadev_timing_htod[MAXNODES] = {0.0}; static double cudadev_timing_dtoh[MAXNODES] = {0.0}; static int ncuda; static unsigned was_benchmarked = 0; static void measure_bandwith_between_host_and_dev(int dev) { /* Initiliaze CUDA context on the device */ cudaSetDevice(dev); /* hack to force the initialization */ cudaFree(0); /* Allocate a buffer on the device */ unsigned char *d_buffer; cudaMalloc((void **)&d_buffer, SIZE); assert(d_buffer); /* Allocate a buffer on the host */ unsigned char *h_buffer; cudaHostAlloc((void **)&h_buffer, SIZE, 0); assert(h_buffer); /* Fill them */ memset(h_buffer, 0, SIZE); cudaMemset(d_buffer, 0, SIZE); unsigned iter; double timing; struct timeval start; struct timeval end; /* Measure upload bandwith */ gettimeofday(&start, NULL); for (iter = 0; iter < NITER; iter++) { cudaMemcpy(d_buffer, h_buffer, SIZE, cudaMemcpyHostToDevice); cudaThreadSynchronize(); } gettimeofday(&end, NULL); timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec)); cudadev_timing_htod[dev+1] = timing/NITER; /* Measure download bandwith */ gettimeofday(&start, NULL); for (iter = 0; iter < NITER; iter++) { cudaMemcpy(h_buffer, d_buffer, SIZE, cudaMemcpyDeviceToHost); cudaThreadSynchronize(); } gettimeofday(&end, NULL); timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec)); cudadev_timing_dtoh[dev+1] = timing/NITER; /* Free buffers */ cudaFreeHost(h_buffer); cudaFree(d_buffer); cudaThreadExit(); } static void benchmark_all_cuda_devices(void) { #ifdef VERBOSE fprintf(stderr, "Benchmarking the speed of the bus\n"); #endif cudaGetDeviceCount(&ncuda); int i; for (i = 0; i < ncuda; i++) { /* measure bandwith between Host and Device i */ measure_bandwith_between_host_and_dev(i); } was_benchmarked = 1; #ifdef VERBOSE fprintf(stderr, "Benchmarking the speed of the bus is done.\n"); #endif } static void get_bus_path(const char *type, char *path, size_t maxlen) { strncpy(path, PERF_MODEL_DIR_BUS, maxlen); strncat(path, type, maxlen); char hostname[32]; gethostname(hostname, 32); strncat(path, ".", maxlen); strncat(path, hostname, maxlen); } /* * Latency */ static void get_latency_path(char *path, size_t maxlen) { get_bus_path("latency", path, maxlen); } static void load_bus_latency_file_content(void) { int n; unsigned src, dst; FILE *f; char path[256]; get_latency_path(path, 256); f = fopen(path, "r"); STARPU_ASSERT(f); for (src = 0; src < MAXNODES; src++) { for (dst = 0; dst < MAXNODES; dst++) { double latency; n = fscanf(f, "%lf ", &latency); STARPU_ASSERT(n == 1); latency_matrix[src][dst] = latency; } n = fscanf(f, "\n"); STARPU_ASSERT(n == 0); } fclose(f); } static void write_bus_latency_file_content(void) { int src, dst; FILE *f; STARPU_ASSERT(was_benchmarked); char path[256]; get_latency_path(path, 256); f = fopen(path, "w+"); STARPU_ASSERT(f); for (src = 0; src < MAXNODES; src++) { for (dst = 0; dst < MAXNODES; dst++) { double latency; if ((src > ncuda) || (dst > ncuda)) { /* convention */ latency = -1.0; } else if (src == dst) { latency = 0.0; } else { latency = ((src && dst)?2000.0:500.0); } fprintf(f, "%lf ", latency); } fprintf(f, "\n"); } fclose(f); } static void load_bus_latency_file(void) { int res; char path[256]; get_latency_path(path, 256); res = access(path, F_OK); if (res) { /* File does not exist yet */ if (!was_benchmarked) benchmark_all_cuda_devices(); write_bus_latency_file_content(); } load_bus_latency_file_content(); } /* * Bandwith */ static void get_bandwith_path(char *path, size_t maxlen) { get_bus_path("bandwith", path, maxlen); } static void load_bus_bandwith_file_content(void) { int n; unsigned src, dst; FILE *f; char path[256]; get_bandwith_path(path, 256); f = fopen(path, "r"); STARPU_ASSERT(f); for (src = 0; src < MAXNODES; src++) { for (dst = 0; dst < MAXNODES; dst++) { double bandwith; n = fscanf(f, "%lf ", &bandwith); STARPU_ASSERT(n == 1); bandwith_matrix[src][dst] = bandwith; } n = fscanf(f, "\n"); STARPU_ASSERT(n == 0); } fclose(f); } static void write_bus_bandwith_file_content(void) { int src, dst; FILE *f; STARPU_ASSERT(was_benchmarked); char path[256]; get_bandwith_path(path, 256); f = fopen(path, "w+"); STARPU_ASSERT(f); for (src = 0; src < MAXNODES; src++) { for (dst = 0; dst < MAXNODES; dst++) { double bandwith; if ((src > ncuda) || (dst > ncuda)) { bandwith = -1.0; } else if (src != dst) { /* Bandwith = (SIZE)/(time i -> ram + time ram -> j)*/ double time_src_to_ram = (src==0)?0.0:cudadev_timing_dtoh[src]; double time_ram_to_dst = (dst==0)?0.0:cudadev_timing_htod[dst]; double timing =time_src_to_ram + time_ram_to_dst; bandwith = 1.0*SIZE/timing; } else { /* convention */ bandwith = 0.0; } fprintf(f, "%lf ", bandwith); } fprintf(f, "\n"); } fclose(f); } static void load_bus_bandwith_file(void) { int res; char path[256]; get_bandwith_path(path, 256); res = access(path, F_OK); if (res) { /* File does not exist yet */ if (!was_benchmarked) benchmark_all_cuda_devices(); write_bus_bandwith_file_content(); } load_bus_bandwith_file_content(); } /* * Generic */ void load_bus_performance_files(void) { load_bus_latency_file(); load_bus_bandwith_file(); } double predict_transfer_time(unsigned src_node, unsigned dst_node, size_t size) { double bandwith = bandwith_matrix[src_node][dst_node]; double latency = latency_matrix[src_node][dst_node]; return latency + size/bandwith; }