|
@@ -0,0 +1,354 @@
|
|
|
+/*
|
|
|
+ * StarPU
|
|
|
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
|
|
|
+ *
|
|
|
+ * This program is free software; you can redistribute it and/or modify
|
|
|
+ * it under the terms of the GNU Lesser General Public License as published by
|
|
|
+ * the Free Software Foundation; either version 2.1 of the License, or (at
|
|
|
+ * your option) any later version.
|
|
|
+ *
|
|
|
+ * This program is distributed in the hope that it will be useful, but
|
|
|
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
|
+ *
|
|
|
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
|
|
|
+ */
|
|
|
+
|
|
|
+#include <unistd.h>
|
|
|
+#include <sys/time.h>
|
|
|
+
|
|
|
+#include <starpu.h>
|
|
|
+#include <common/config.h>
|
|
|
+#include <core/perfmodel/perfmodel.h>
|
|
|
+#include <datawizard/data_parameters.h>
|
|
|
+
|
|
|
+
|
|
|
+#define SIZE (32*1024*1024*sizeof(char))
|
|
|
+#define NITER 128
|
|
|
+
|
|
|
+static double bandwith_matrix[MAXNODES][MAXNODES] = {{-1.0}};
|
|
|
+static double latency_matrix[MAXNODES][MAXNODES] = {{ -1.0}};
|
|
|
+
|
|
|
+/* Benchmarking the performance of the bus */
|
|
|
+
|
|
|
+static double cudadev_timing_htod[MAXNODES] = {0.0};
|
|
|
+static double cudadev_timing_dtoh[MAXNODES] = {0.0};
|
|
|
+static int ncuda;
|
|
|
+static unsigned was_benchmarked = 0;
|
|
|
+
|
|
|
+static void measure_bandwith_between_host_and_dev(int dev)
|
|
|
+{
|
|
|
+ /* Initiliaze CUDA context on the device */
|
|
|
+ cudaSetDevice(dev);
|
|
|
+
|
|
|
+ /* hack to force the initialization */
|
|
|
+ cudaFree(0);
|
|
|
+
|
|
|
+ /* Allocate a buffer on the device */
|
|
|
+ unsigned char *d_buffer;
|
|
|
+ cudaMalloc((void **)&d_buffer, SIZE);
|
|
|
+ assert(d_buffer);
|
|
|
+
|
|
|
+ /* Allocate a buffer on the host */
|
|
|
+ unsigned char *h_buffer;
|
|
|
+ cudaHostAlloc((void **)&h_buffer, SIZE, 0);
|
|
|
+ assert(h_buffer);
|
|
|
+
|
|
|
+ /* Fill them */
|
|
|
+ memset(h_buffer, 0, SIZE);
|
|
|
+ cudaMemset(d_buffer, 0, SIZE);
|
|
|
+
|
|
|
+ unsigned iter;
|
|
|
+ double timing;
|
|
|
+ struct timeval start;
|
|
|
+ struct timeval end;
|
|
|
+
|
|
|
+ /* Measure upload bandwith */
|
|
|
+ gettimeofday(&start, NULL);
|
|
|
+ for (iter = 0; iter < NITER; iter++)
|
|
|
+ {
|
|
|
+ cudaMemcpy(d_buffer, h_buffer, SIZE, cudaMemcpyHostToDevice);
|
|
|
+ cudaThreadSynchronize();
|
|
|
+ }
|
|
|
+ gettimeofday(&end, NULL);
|
|
|
+ timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
|
|
|
+
|
|
|
+ cudadev_timing_htod[dev+1] = timing/NITER;
|
|
|
+
|
|
|
+ /* Measure download bandwith */
|
|
|
+ gettimeofday(&start, NULL);
|
|
|
+ for (iter = 0; iter < NITER; iter++)
|
|
|
+ {
|
|
|
+ cudaMemcpy(h_buffer, d_buffer, SIZE, cudaMemcpyDeviceToHost);
|
|
|
+ cudaThreadSynchronize();
|
|
|
+ }
|
|
|
+ gettimeofday(&end, NULL);
|
|
|
+ timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
|
|
|
+
|
|
|
+ cudadev_timing_dtoh[dev+1] = timing/NITER;
|
|
|
+
|
|
|
+ /* Free buffers */
|
|
|
+ cudaFreeHost(h_buffer);
|
|
|
+ cudaFree(d_buffer);
|
|
|
+
|
|
|
+ cudaThreadExit();
|
|
|
+}
|
|
|
+
|
|
|
+static void benchmark_all_cuda_devices(void)
|
|
|
+{
|
|
|
+#ifdef VERBOSE
|
|
|
+ fprintf(stderr, "Benchmarking the speed of the bus\n");
|
|
|
+#endif
|
|
|
+
|
|
|
+ cudaGetDeviceCount(&ncuda);
|
|
|
+
|
|
|
+ int i;
|
|
|
+ for (i = 0; i < ncuda; i++)
|
|
|
+ {
|
|
|
+ /* measure bandwith between Host and Device i */
|
|
|
+ measure_bandwith_between_host_and_dev(i);
|
|
|
+ }
|
|
|
+
|
|
|
+ was_benchmarked = 1;
|
|
|
+
|
|
|
+#ifdef VERBOSE
|
|
|
+ fprintf(stderr, "Benchmarking the speed of the bus is done.\n");
|
|
|
+#endif
|
|
|
+}
|
|
|
+
|
|
|
+static void get_bus_path(const char *type, char *path, size_t maxlen)
|
|
|
+{
|
|
|
+ strncpy(path, PERF_MODEL_DIR_BUS, maxlen);
|
|
|
+ strncat(path, type, maxlen);
|
|
|
+
|
|
|
+ char hostname[32];
|
|
|
+ gethostname(hostname, 32);
|
|
|
+ strncat(path, ".", maxlen);
|
|
|
+ strncat(path, hostname, maxlen);
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ * Latency
|
|
|
+ */
|
|
|
+
|
|
|
+static void get_latency_path(char *path, size_t maxlen)
|
|
|
+{
|
|
|
+ get_bus_path("latency", path, maxlen);
|
|
|
+}
|
|
|
+
|
|
|
+static void load_bus_latency_file_content(void)
|
|
|
+{
|
|
|
+ int n;
|
|
|
+ unsigned src, dst;
|
|
|
+ FILE *f;
|
|
|
+
|
|
|
+ char path[256];
|
|
|
+ get_latency_path(path, 256);
|
|
|
+
|
|
|
+ f = fopen(path, "r");
|
|
|
+ STARPU_ASSERT(f);
|
|
|
+
|
|
|
+ for (src = 0; src < MAXNODES; src++)
|
|
|
+ {
|
|
|
+ for (dst = 0; dst < MAXNODES; dst++)
|
|
|
+ {
|
|
|
+ double latency;
|
|
|
+
|
|
|
+ n = fscanf(f, "%lf ", &latency);
|
|
|
+ STARPU_ASSERT(n == 1);
|
|
|
+
|
|
|
+ latency_matrix[src][dst] = latency;
|
|
|
+ }
|
|
|
+
|
|
|
+ n = fscanf(f, "\n");
|
|
|
+ STARPU_ASSERT(n == 0);
|
|
|
+ }
|
|
|
+
|
|
|
+ fclose(f);
|
|
|
+}
|
|
|
+
|
|
|
+static void write_bus_latency_file_content(void)
|
|
|
+{
|
|
|
+ int src, dst;
|
|
|
+ FILE *f;
|
|
|
+
|
|
|
+ STARPU_ASSERT(was_benchmarked);
|
|
|
+
|
|
|
+ char path[256];
|
|
|
+ get_latency_path(path, 256);
|
|
|
+
|
|
|
+ f = fopen(path, "w+");
|
|
|
+ STARPU_ASSERT(f);
|
|
|
+
|
|
|
+ for (src = 0; src < MAXNODES; src++)
|
|
|
+ {
|
|
|
+ for (dst = 0; dst < MAXNODES; dst++)
|
|
|
+ {
|
|
|
+ double latency;
|
|
|
+
|
|
|
+ if ((src > ncuda) || (dst > ncuda))
|
|
|
+ {
|
|
|
+ /* convention */
|
|
|
+ latency = -1.0;
|
|
|
+ }
|
|
|
+ else if (src == dst)
|
|
|
+ {
|
|
|
+ latency = 0.0;
|
|
|
+ }
|
|
|
+ else {
|
|
|
+ latency = ((src && dst)?2000.0:500.0);
|
|
|
+ }
|
|
|
+
|
|
|
+ fprintf(f, "%lf ", latency);
|
|
|
+ }
|
|
|
+
|
|
|
+ fprintf(f, "\n");
|
|
|
+ }
|
|
|
+
|
|
|
+ fclose(f);
|
|
|
+}
|
|
|
+
|
|
|
+static void load_bus_latency_file(void)
|
|
|
+{
|
|
|
+ int res;
|
|
|
+
|
|
|
+ char path[256];
|
|
|
+ get_latency_path(path, 256);
|
|
|
+
|
|
|
+ res = access(path, F_OK);
|
|
|
+ if (res)
|
|
|
+ {
|
|
|
+ /* File does not exist yet */
|
|
|
+ if (!was_benchmarked)
|
|
|
+ benchmark_all_cuda_devices();
|
|
|
+
|
|
|
+ write_bus_latency_file_content();
|
|
|
+ }
|
|
|
+
|
|
|
+ load_bus_latency_file_content();
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
+/*
|
|
|
+ * Bandwith
|
|
|
+ */
|
|
|
+static void get_bandwith_path(char *path, size_t maxlen)
|
|
|
+{
|
|
|
+ get_bus_path("bandwith", path, maxlen);
|
|
|
+}
|
|
|
+
|
|
|
+static void load_bus_bandwith_file_content(void)
|
|
|
+{
|
|
|
+ int n;
|
|
|
+ unsigned src, dst;
|
|
|
+ FILE *f;
|
|
|
+
|
|
|
+ char path[256];
|
|
|
+ get_bandwith_path(path, 256);
|
|
|
+
|
|
|
+ f = fopen(path, "r");
|
|
|
+ STARPU_ASSERT(f);
|
|
|
+
|
|
|
+ for (src = 0; src < MAXNODES; src++)
|
|
|
+ {
|
|
|
+ for (dst = 0; dst < MAXNODES; dst++)
|
|
|
+ {
|
|
|
+ double bandwith;
|
|
|
+
|
|
|
+ n = fscanf(f, "%lf ", &bandwith);
|
|
|
+ STARPU_ASSERT(n == 1);
|
|
|
+
|
|
|
+ bandwith_matrix[src][dst] = bandwith;
|
|
|
+ }
|
|
|
+
|
|
|
+ n = fscanf(f, "\n");
|
|
|
+ STARPU_ASSERT(n == 0);
|
|
|
+ }
|
|
|
+
|
|
|
+ fclose(f);
|
|
|
+}
|
|
|
+
|
|
|
+static void write_bus_bandwith_file_content(void)
|
|
|
+{
|
|
|
+ int src, dst;
|
|
|
+ FILE *f;
|
|
|
+
|
|
|
+ STARPU_ASSERT(was_benchmarked);
|
|
|
+
|
|
|
+ char path[256];
|
|
|
+ get_bandwith_path(path, 256);
|
|
|
+
|
|
|
+ f = fopen(path, "w+");
|
|
|
+ STARPU_ASSERT(f);
|
|
|
+
|
|
|
+ for (src = 0; src < MAXNODES; src++)
|
|
|
+ {
|
|
|
+ for (dst = 0; dst < MAXNODES; dst++)
|
|
|
+ {
|
|
|
+ double bandwith;
|
|
|
+
|
|
|
+ if ((src > ncuda) || (dst > ncuda))
|
|
|
+ {
|
|
|
+ bandwith = -1.0;
|
|
|
+ }
|
|
|
+ else if (src != dst)
|
|
|
+ {
|
|
|
+ /* Bandwith = (SIZE)/(time i -> ram + time ram -> j)*/
|
|
|
+ double time_src_to_ram = (src==0)?0.0:cudadev_timing_dtoh[src];
|
|
|
+ double time_ram_to_dst = (dst==0)?0.0:cudadev_timing_htod[dst];
|
|
|
+
|
|
|
+ double timing =time_src_to_ram + time_ram_to_dst;
|
|
|
+
|
|
|
+ bandwith = 1.0*SIZE/timing;
|
|
|
+ }
|
|
|
+ else {
|
|
|
+ /* convention */
|
|
|
+ bandwith = 0.0;
|
|
|
+ }
|
|
|
+
|
|
|
+ fprintf(f, "%lf ", bandwith);
|
|
|
+ }
|
|
|
+
|
|
|
+ fprintf(f, "\n");
|
|
|
+ }
|
|
|
+
|
|
|
+ fclose(f);
|
|
|
+}
|
|
|
+
|
|
|
+static void load_bus_bandwith_file(void)
|
|
|
+{
|
|
|
+ int res;
|
|
|
+
|
|
|
+ char path[256];
|
|
|
+ get_bandwith_path(path, 256);
|
|
|
+
|
|
|
+ res = access(path, F_OK);
|
|
|
+ if (res)
|
|
|
+ {
|
|
|
+ /* File does not exist yet */
|
|
|
+ if (!was_benchmarked)
|
|
|
+ benchmark_all_cuda_devices();
|
|
|
+
|
|
|
+ write_bus_bandwith_file_content();
|
|
|
+ }
|
|
|
+
|
|
|
+ load_bus_bandwith_file_content();
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ * Generic
|
|
|
+ */
|
|
|
+
|
|
|
+void load_bus_performance_files(void)
|
|
|
+{
|
|
|
+ load_bus_latency_file();
|
|
|
+ load_bus_bandwith_file();
|
|
|
+}
|
|
|
+
|
|
|
+double predict_transfer_time(unsigned src_node, unsigned dst_node, size_t size)
|
|
|
+{
|
|
|
+ double bandwith = bandwith_matrix[src_node][dst_node];
|
|
|
+ double latency = latency_matrix[src_node][dst_node];
|
|
|
+
|
|
|
+ return latency + size/bandwith;
|
|
|
+}
|