|
@@ -1,7 +1,7 @@
|
|
/* StarPU --- Runtime system for heterogeneous multicore architectures.
|
|
/* StarPU --- Runtime system for heterogeneous multicore architectures.
|
|
*
|
|
*
|
|
* Copyright (C) 2009, 2010-2011 Université de Bordeaux 1
|
|
* Copyright (C) 2009, 2010-2011 Université de Bordeaux 1
|
|
- * Copyright (C) 2010, 2011 Centre National de la Recherche Scientifique
|
|
|
|
|
|
+ * Copyright (C) 2010, 2011, 2012 Centre National de la Recherche Scientifique
|
|
*
|
|
*
|
|
* StarPU is free software; you can redistribute it and/or modify
|
|
* StarPU is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU Lesser General Public License as published by
|
|
* it under the terms of the GNU Lesser General Public License as published by
|
|
@@ -46,14 +46,16 @@
|
|
|
|
|
|
#define MAXCPUS 32
|
|
#define MAXCPUS 32
|
|
|
|
|
|
-struct dev_timing {
|
|
|
|
|
|
+/* timing is in µs per byte (i.e. slowness, inverse of bandwidth) */
|
|
|
|
+struct dev_timing
|
|
|
|
+{
|
|
int cpu_id;
|
|
int cpu_id;
|
|
double timing_htod;
|
|
double timing_htod;
|
|
double timing_dtoh;
|
|
double timing_dtoh;
|
|
};
|
|
};
|
|
|
|
|
|
-static double bandwidth_matrix[STARPU_MAXNODES][STARPU_MAXNODES] = {{-1.0}};
|
|
|
|
-static double latency_matrix[STARPU_MAXNODES][STARPU_MAXNODES] = {{ -1.0}};
|
|
|
|
|
|
+static double bandwidth_matrix[STARPU_MAXNODES][STARPU_MAXNODES] = {{NAN}};
|
|
|
|
+static double latency_matrix[STARPU_MAXNODES][STARPU_MAXNODES] = {{NAN}};
|
|
static unsigned was_benchmarked = 0;
|
|
static unsigned was_benchmarked = 0;
|
|
static unsigned ncpus = 0;
|
|
static unsigned ncpus = 0;
|
|
static int ncuda = 0;
|
|
static int ncuda = 0;
|
|
@@ -65,15 +67,16 @@ static int nopencl = 0;
|
|
static int cuda_affinity_matrix[STARPU_MAXCUDADEVS][MAXCPUS];
|
|
static int cuda_affinity_matrix[STARPU_MAXCUDADEVS][MAXCPUS];
|
|
static double cudadev_timing_htod[STARPU_MAXNODES] = {0.0};
|
|
static double cudadev_timing_htod[STARPU_MAXNODES] = {0.0};
|
|
static double cudadev_timing_dtoh[STARPU_MAXNODES] = {0.0};
|
|
static double cudadev_timing_dtoh[STARPU_MAXNODES] = {0.0};
|
|
|
|
+#ifdef HAVE_CUDA_MEMCPY_PEER
|
|
|
|
+static double cudadev_timing_dtod[STARPU_MAXNODES][STARPU_MAXNODES] = {{0.0}};
|
|
|
|
+#endif
|
|
static struct dev_timing cudadev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
|
|
static struct dev_timing cudadev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
|
|
-static size_t cuda_size = SIZE;
|
|
|
|
#endif
|
|
#endif
|
|
#ifdef STARPU_USE_OPENCL
|
|
#ifdef STARPU_USE_OPENCL
|
|
static int opencl_affinity_matrix[STARPU_MAXOPENCLDEVS][MAXCPUS];
|
|
static int opencl_affinity_matrix[STARPU_MAXOPENCLDEVS][MAXCPUS];
|
|
static double opencldev_timing_htod[STARPU_MAXNODES] = {0.0};
|
|
static double opencldev_timing_htod[STARPU_MAXNODES] = {0.0};
|
|
static double opencldev_timing_dtoh[STARPU_MAXNODES] = {0.0};
|
|
static double opencldev_timing_dtoh[STARPU_MAXNODES] = {0.0};
|
|
static struct dev_timing opencldev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
|
|
static struct dev_timing opencldev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
|
|
-static size_t opencl_size = SIZE;
|
|
|
|
#endif
|
|
#endif
|
|
|
|
|
|
#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
|
|
#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
|
|
@@ -85,8 +88,9 @@ static hwloc_topology_t hwtopology;
|
|
#ifdef STARPU_USE_CUDA
|
|
#ifdef STARPU_USE_CUDA
|
|
static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int cpu, struct dev_timing *dev_timing_per_cpu)
|
|
static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int cpu, struct dev_timing *dev_timing_per_cpu)
|
|
{
|
|
{
|
|
- struct starpu_machine_config_s *config = _starpu_get_machine_config();
|
|
|
|
|
|
+ struct _starpu_machine_config *config = _starpu_get_machine_config();
|
|
_starpu_bind_thread_on_cpu(config, cpu);
|
|
_starpu_bind_thread_on_cpu(config, cpu);
|
|
|
|
+ size_t size = SIZE;
|
|
|
|
|
|
/* Initiliaze CUDA context on the device */
|
|
/* Initiliaze CUDA context on the device */
|
|
cudaSetDevice(dev);
|
|
cudaSetDevice(dev);
|
|
@@ -105,34 +109,31 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
|
|
cudaError_t cures;
|
|
cudaError_t cures;
|
|
cures = cudaGetDeviceProperties(&prop, dev);
|
|
cures = cudaGetDeviceProperties(&prop, dev);
|
|
if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
|
|
if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
|
|
- if (cuda_size > prop.totalGlobalMem/4) cuda_size = prop.totalGlobalMem/4;
|
|
|
|
|
|
+ if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
|
|
|
|
|
|
/* Allocate a buffer on the device */
|
|
/* Allocate a buffer on the device */
|
|
unsigned char *d_buffer;
|
|
unsigned char *d_buffer;
|
|
- cudaMalloc((void **)&d_buffer, cuda_size);
|
|
|
|
- assert(d_buffer);
|
|
|
|
|
|
+ cudaMalloc((void **)&d_buffer, size);
|
|
|
|
+ STARPU_ASSERT(d_buffer);
|
|
|
|
|
|
/* hack to avoid third party libs to rebind threads */
|
|
/* hack to avoid third party libs to rebind threads */
|
|
_starpu_bind_thread_on_cpu(config, cpu);
|
|
_starpu_bind_thread_on_cpu(config, cpu);
|
|
|
|
|
|
-
|
|
|
|
/* Allocate a buffer on the host */
|
|
/* Allocate a buffer on the host */
|
|
unsigned char *h_buffer;
|
|
unsigned char *h_buffer;
|
|
- cudaHostAlloc((void **)&h_buffer, cuda_size, 0);
|
|
|
|
- assert(h_buffer);
|
|
|
|
|
|
+ cures = cudaHostAlloc((void **)&h_buffer, size, 0);
|
|
|
|
+ STARPU_ASSERT(cures == cudaSuccess);
|
|
|
|
|
|
/* hack to avoid third party libs to rebind threads */
|
|
/* hack to avoid third party libs to rebind threads */
|
|
_starpu_bind_thread_on_cpu(config, cpu);
|
|
_starpu_bind_thread_on_cpu(config, cpu);
|
|
|
|
|
|
-
|
|
|
|
/* Fill them */
|
|
/* Fill them */
|
|
- memset(h_buffer, 0, cuda_size);
|
|
|
|
- cudaMemset(d_buffer, 0, cuda_size);
|
|
|
|
|
|
+ memset(h_buffer, 0, size);
|
|
|
|
+ cudaMemset(d_buffer, 0, size);
|
|
|
|
|
|
/* hack to avoid third party libs to rebind threads */
|
|
/* hack to avoid third party libs to rebind threads */
|
|
_starpu_bind_thread_on_cpu(config, cpu);
|
|
_starpu_bind_thread_on_cpu(config, cpu);
|
|
|
|
|
|
-
|
|
|
|
unsigned iter;
|
|
unsigned iter;
|
|
double timing;
|
|
double timing;
|
|
struct timeval start;
|
|
struct timeval start;
|
|
@@ -142,25 +143,25 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
|
|
gettimeofday(&start, NULL);
|
|
gettimeofday(&start, NULL);
|
|
for (iter = 0; iter < NITER; iter++)
|
|
for (iter = 0; iter < NITER; iter++)
|
|
{
|
|
{
|
|
- cudaMemcpy(d_buffer, h_buffer, cuda_size, cudaMemcpyHostToDevice);
|
|
|
|
|
|
+ cudaMemcpy(d_buffer, h_buffer, size, cudaMemcpyHostToDevice);
|
|
cudaThreadSynchronize();
|
|
cudaThreadSynchronize();
|
|
}
|
|
}
|
|
gettimeofday(&end, NULL);
|
|
gettimeofday(&end, NULL);
|
|
timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
|
|
timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
|
|
|
|
|
|
- dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod = timing/NITER;
|
|
|
|
|
|
+ dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod = timing/NITER/size;
|
|
|
|
|
|
/* Measure download bandwidth */
|
|
/* Measure download bandwidth */
|
|
gettimeofday(&start, NULL);
|
|
gettimeofday(&start, NULL);
|
|
for (iter = 0; iter < NITER; iter++)
|
|
for (iter = 0; iter < NITER; iter++)
|
|
{
|
|
{
|
|
- cudaMemcpy(h_buffer, d_buffer, cuda_size, cudaMemcpyDeviceToHost);
|
|
|
|
|
|
+ cudaMemcpy(h_buffer, d_buffer, size, cudaMemcpyDeviceToHost);
|
|
cudaThreadSynchronize();
|
|
cudaThreadSynchronize();
|
|
}
|
|
}
|
|
gettimeofday(&end, NULL);
|
|
gettimeofday(&end, NULL);
|
|
timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
|
|
timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
|
|
|
|
|
|
- dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh = timing/NITER;
|
|
|
|
|
|
+ dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh = timing/NITER/size;
|
|
|
|
|
|
/* Free buffers */
|
|
/* Free buffers */
|
|
cudaFreeHost(h_buffer);
|
|
cudaFreeHost(h_buffer);
|
|
@@ -168,6 +169,65 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
|
|
|
|
|
|
cudaThreadExit();
|
|
cudaThreadExit();
|
|
}
|
|
}
|
|
|
|
+
|
|
|
|
+#ifdef HAVE_CUDA_MEMCPY_PEER
|
|
|
|
+static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
|
|
|
|
+{
|
|
|
|
+ size_t size = SIZE;
|
|
|
|
+
|
|
|
|
+ /* Get the maximum size which can be allocated on the device */
|
|
|
|
+ struct cudaDeviceProp prop;
|
|
|
|
+ cudaError_t cures;
|
|
|
|
+ cures = cudaGetDeviceProperties(&prop, src);
|
|
|
|
+ if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
|
|
|
|
+ if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
|
|
|
|
+ cures = cudaGetDeviceProperties(&prop, dst);
|
|
|
|
+ if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
|
|
|
|
+ if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
|
|
|
|
+
|
|
|
|
+ /* Initiliaze CUDA context on the source */
|
|
|
|
+ cudaSetDevice(src);
|
|
|
|
+
|
|
|
|
+ /* Allocate a buffer on the device */
|
|
|
|
+ unsigned char *s_buffer;
|
|
|
|
+ cudaMalloc((void **)&s_buffer, size);
|
|
|
|
+ STARPU_ASSERT(s_buffer);
|
|
|
|
+ cudaMemset(s_buffer, 0, size);
|
|
|
|
+
|
|
|
|
+ /* Initiliaze CUDA context on the destination */
|
|
|
|
+ cudaSetDevice(dst);
|
|
|
|
+
|
|
|
|
+ /* Allocate a buffer on the device */
|
|
|
|
+ unsigned char *d_buffer;
|
|
|
|
+ cudaMalloc((void **)&d_buffer, size);
|
|
|
|
+ STARPU_ASSERT(d_buffer);
|
|
|
|
+ cudaMemset(d_buffer, 0, size);
|
|
|
|
+
|
|
|
|
+ unsigned iter;
|
|
|
|
+ double timing;
|
|
|
|
+ struct timeval start;
|
|
|
|
+ struct timeval end;
|
|
|
|
+
|
|
|
|
+ /* Measure upload bandwidth */
|
|
|
|
+ gettimeofday(&start, NULL);
|
|
|
|
+ for (iter = 0; iter < NITER; iter++)
|
|
|
|
+ {
|
|
|
|
+ cudaMemcpyPeer(d_buffer, dst, s_buffer, src, size);
|
|
|
|
+ cudaThreadSynchronize();
|
|
|
|
+ }
|
|
|
|
+ gettimeofday(&end, NULL);
|
|
|
|
+ timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
|
|
|
|
+
|
|
|
|
+ cudadev_timing_dtod[src+1][dst+1] = timing/NITER/size;
|
|
|
|
+
|
|
|
|
+ /* Free buffers */
|
|
|
|
+ cudaFree(d_buffer);
|
|
|
|
+ cudaSetDevice(src);
|
|
|
|
+ cudaFree(s_buffer);
|
|
|
|
+
|
|
|
|
+ cudaThreadExit();
|
|
|
|
+}
|
|
|
|
+#endif
|
|
#endif
|
|
#endif
|
|
|
|
|
|
#ifdef STARPU_USE_OPENCL
|
|
#ifdef STARPU_USE_OPENCL
|
|
@@ -176,8 +236,9 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, i
|
|
cl_context context;
|
|
cl_context context;
|
|
cl_command_queue queue;
|
|
cl_command_queue queue;
|
|
cl_int err=0;
|
|
cl_int err=0;
|
|
|
|
+ size_t size = SIZE;
|
|
|
|
|
|
- struct starpu_machine_config_s *config = _starpu_get_machine_config();
|
|
|
|
|
|
+ struct _starpu_machine_config *config = _starpu_get_machine_config();
|
|
_starpu_bind_thread_on_cpu(config, cpu);
|
|
_starpu_bind_thread_on_cpu(config, cpu);
|
|
|
|
|
|
/* Initialize OpenCL context on the device */
|
|
/* Initialize OpenCL context on the device */
|
|
@@ -191,28 +252,28 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, i
|
|
starpu_opencl_get_device(dev, &device);
|
|
starpu_opencl_get_device(dev, &device);
|
|
err = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(maxMemAllocSize), &maxMemAllocSize, NULL);
|
|
err = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(maxMemAllocSize), &maxMemAllocSize, NULL);
|
|
if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
|
|
if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
|
|
- if (opencl_size > (size_t)maxMemAllocSize/4) opencl_size = maxMemAllocSize/4;
|
|
|
|
|
|
+ if (size > (size_t)maxMemAllocSize/4) size = maxMemAllocSize/4;
|
|
|
|
|
|
/* hack to avoid third party libs to rebind threads */
|
|
/* hack to avoid third party libs to rebind threads */
|
|
_starpu_bind_thread_on_cpu(config, cpu);
|
|
_starpu_bind_thread_on_cpu(config, cpu);
|
|
|
|
|
|
/* Allocate a buffer on the device */
|
|
/* Allocate a buffer on the device */
|
|
cl_mem d_buffer;
|
|
cl_mem d_buffer;
|
|
- d_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, opencl_size, NULL, &err);
|
|
|
|
|
|
+ d_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);
|
|
if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
|
|
if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
|
|
|
|
|
|
/* hack to avoid third party libs to rebind threads */
|
|
/* hack to avoid third party libs to rebind threads */
|
|
_starpu_bind_thread_on_cpu(config, cpu);
|
|
_starpu_bind_thread_on_cpu(config, cpu);
|
|
/* Allocate a buffer on the host */
|
|
/* Allocate a buffer on the host */
|
|
unsigned char *h_buffer;
|
|
unsigned char *h_buffer;
|
|
- h_buffer = malloc(opencl_size);
|
|
|
|
- assert(h_buffer);
|
|
|
|
|
|
+ h_buffer = (unsigned char *)malloc(size);
|
|
|
|
+ STARPU_ASSERT(h_buffer);
|
|
|
|
|
|
/* hack to avoid third party libs to rebind threads */
|
|
/* hack to avoid third party libs to rebind threads */
|
|
_starpu_bind_thread_on_cpu(config, cpu);
|
|
_starpu_bind_thread_on_cpu(config, cpu);
|
|
/* Fill them */
|
|
/* Fill them */
|
|
- memset(h_buffer, 0, opencl_size);
|
|
|
|
- err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, opencl_size, h_buffer, 0, NULL, NULL);
|
|
|
|
|
|
+ memset(h_buffer, 0, size);
|
|
|
|
+ err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
|
|
if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
|
|
if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
|
|
/* hack to avoid third party libs to rebind threads */
|
|
/* hack to avoid third party libs to rebind threads */
|
|
_starpu_bind_thread_on_cpu(config, cpu);
|
|
_starpu_bind_thread_on_cpu(config, cpu);
|
|
@@ -226,25 +287,25 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, i
|
|
gettimeofday(&start, NULL);
|
|
gettimeofday(&start, NULL);
|
|
for (iter = 0; iter < NITER; iter++)
|
|
for (iter = 0; iter < NITER; iter++)
|
|
{
|
|
{
|
|
- err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, opencl_size, h_buffer, 0, NULL, NULL);
|
|
|
|
|
|
+ err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
|
|
if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
|
|
if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
|
|
}
|
|
}
|
|
gettimeofday(&end, NULL);
|
|
gettimeofday(&end, NULL);
|
|
timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
|
|
timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
|
|
|
|
|
|
- dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod = timing/NITER;
|
|
|
|
|
|
+ dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod = timing/NITER/size;
|
|
|
|
|
|
/* Measure download bandwidth */
|
|
/* Measure download bandwidth */
|
|
gettimeofday(&start, NULL);
|
|
gettimeofday(&start, NULL);
|
|
for (iter = 0; iter < NITER; iter++)
|
|
for (iter = 0; iter < NITER; iter++)
|
|
{
|
|
{
|
|
- err = clEnqueueReadBuffer(queue, d_buffer, CL_TRUE, 0, opencl_size, h_buffer, 0, NULL, NULL);
|
|
|
|
|
|
+ err = clEnqueueReadBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
|
|
if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
|
|
if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
|
|
}
|
|
}
|
|
gettimeofday(&end, NULL);
|
|
gettimeofday(&end, NULL);
|
|
timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
|
|
timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
|
|
|
|
|
|
- dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh = timing/NITER;
|
|
|
|
|
|
+ dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh = timing/NITER/size;
|
|
|
|
|
|
/* Free buffers */
|
|
/* Free buffers */
|
|
clReleaseMemObject(d_buffer);
|
|
clReleaseMemObject(d_buffer);
|
|
@@ -258,8 +319,8 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, i
|
|
/* NB: we want to sort the bandwidth by DECREASING order */
|
|
/* NB: we want to sort the bandwidth by DECREASING order */
|
|
static int compar_dev_timing(const void *left_dev_timing, const void *right_dev_timing)
|
|
static int compar_dev_timing(const void *left_dev_timing, const void *right_dev_timing)
|
|
{
|
|
{
|
|
- const struct dev_timing *left = left_dev_timing;
|
|
|
|
- const struct dev_timing *right = right_dev_timing;
|
|
|
|
|
|
+ const struct dev_timing *left = (const struct dev_timing *)left_dev_timing;
|
|
|
|
+ const struct dev_timing *right = (const struct dev_timing *)right_dev_timing;
|
|
|
|
|
|
double left_dtoh = left->timing_dtoh;
|
|
double left_dtoh = left->timing_dtoh;
|
|
double left_htod = left->timing_htod;
|
|
double left_htod = left->timing_htod;
|
|
@@ -291,7 +352,7 @@ static int find_numa_node(hwloc_obj_t obj)
|
|
|
|
|
|
STARPU_ASSERT(current->depth == HWLOC_OBJ_NODE);
|
|
STARPU_ASSERT(current->depth == HWLOC_OBJ_NODE);
|
|
|
|
|
|
- return current->logical_index;
|
|
|
|
|
|
+ return current->logical_index;
|
|
}
|
|
}
|
|
#endif
|
|
#endif
|
|
|
|
|
|
@@ -308,12 +369,24 @@ static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *d
|
|
/* If no NUMA node was found, we assume that we have a single memory
|
|
/* If no NUMA node was found, we assume that we have a single memory
|
|
* bank. */
|
|
* bank. */
|
|
const unsigned no_node_obj_was_found = (nnuma_nodes == 0);
|
|
const unsigned no_node_obj_was_found = (nnuma_nodes == 0);
|
|
-
|
|
|
|
- unsigned is_available_per_numa_node[nnuma_nodes];
|
|
|
|
- double dev_timing_htod_per_numa_node[nnuma_nodes];
|
|
|
|
- double dev_timing_dtoh_per_numa_node[nnuma_nodes];
|
|
|
|
|
|
|
|
- memset(is_available_per_numa_node, 0, nnuma_nodes*sizeof(unsigned));
|
|
|
|
|
|
+ unsigned *is_available_per_numa_node = NULL;
|
|
|
|
+ double *dev_timing_htod_per_numa_node = NULL;
|
|
|
|
+ double *dev_timing_dtoh_per_numa_node = NULL;
|
|
|
|
+
|
|
|
|
+ if (!no_node_obj_was_found)
|
|
|
|
+ {
|
|
|
|
+ is_available_per_numa_node = (unsigned *)malloc(nnuma_nodes * sizeof(unsigned));
|
|
|
|
+ STARPU_ASSERT(is_available_per_numa_node);
|
|
|
|
+
|
|
|
|
+ dev_timing_htod_per_numa_node = (double *)malloc(nnuma_nodes * sizeof(double));
|
|
|
|
+ STARPU_ASSERT(dev_timing_htod_per_numa_node);
|
|
|
|
+
|
|
|
|
+ dev_timing_dtoh_per_numa_node = (double *)malloc(nnuma_nodes * sizeof(double));
|
|
|
|
+ STARPU_ASSERT(dev_timing_dtoh_per_numa_node);
|
|
|
|
+
|
|
|
|
+ memset(is_available_per_numa_node, 0, nnuma_nodes*sizeof(unsigned));
|
|
|
|
+ }
|
|
#endif
|
|
#endif
|
|
|
|
|
|
unsigned cpu;
|
|
unsigned cpu;
|
|
@@ -327,9 +400,9 @@ static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *d
|
|
if (!no_node_obj_was_found)
|
|
if (!no_node_obj_was_found)
|
|
{
|
|
{
|
|
hwloc_obj_t obj = hwloc_get_obj_by_depth(hwtopology, cpu_depth, cpu);
|
|
hwloc_obj_t obj = hwloc_get_obj_by_depth(hwtopology, cpu_depth, cpu);
|
|
-
|
|
|
|
|
|
+
|
|
numa_id = find_numa_node(obj);
|
|
numa_id = find_numa_node(obj);
|
|
-
|
|
|
|
|
|
+
|
|
if (is_available_per_numa_node[numa_id])
|
|
if (is_available_per_numa_node[numa_id])
|
|
{
|
|
{
|
|
/* We reuse the previous numbers for that NUMA node */
|
|
/* We reuse the previous numbers for that NUMA node */
|
|
@@ -364,6 +437,15 @@ static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *d
|
|
}
|
|
}
|
|
#endif
|
|
#endif
|
|
}
|
|
}
|
|
|
|
+
|
|
|
|
+#ifdef STARPU_HAVE_HWLOC
|
|
|
|
+ if (!no_node_obj_was_found)
|
|
|
|
+ {
|
|
|
|
+ free(is_available_per_numa_node);
|
|
|
|
+ free(dev_timing_htod_per_numa_node);
|
|
|
|
+ free(dev_timing_dtoh_per_numa_node);
|
|
|
|
+ }
|
|
|
|
+#endif /* STARPU_HAVE_HWLOC */
|
|
}
|
|
}
|
|
|
|
|
|
static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_htod, double *dev_timing_dtoh,
|
|
static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_htod, double *dev_timing_dtoh,
|
|
@@ -386,7 +468,7 @@ static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_h
|
|
|
|
|
|
double bandwidth_sum2 = bandwidth_dtoh*bandwidth_dtoh + bandwidth_htod*bandwidth_htod;
|
|
double bandwidth_sum2 = bandwidth_dtoh*bandwidth_dtoh + bandwidth_htod*bandwidth_htod;
|
|
|
|
|
|
- _STARPU_DISP("BANDWIDTH GPU %d CPU %u - htod %lf - dtoh %lf - %lf\n", dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
|
|
|
|
|
|
+ _STARPU_DISP("BANDWIDTH GPU %d CPU %u - htod %f - dtoh %f - %f\n", dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
|
|
}
|
|
}
|
|
|
|
|
|
unsigned best_cpu = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].cpu_id;
|
|
unsigned best_cpu = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].cpu_id;
|
|
@@ -405,6 +487,9 @@ static void benchmark_all_gpu_devices(void)
|
|
{
|
|
{
|
|
#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
|
|
#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
|
|
int i;
|
|
int i;
|
|
|
|
+#ifdef HAVE_CUDA_MEMCPY_PEER
|
|
|
|
+ int j;
|
|
|
|
+#endif
|
|
|
|
|
|
_STARPU_DEBUG("Benchmarking the speed of the bus\n");
|
|
_STARPU_DEBUG("Benchmarking the speed of the bus\n");
|
|
|
|
|
|
@@ -428,21 +513,33 @@ static void benchmark_all_gpu_devices(void)
|
|
#warning Missing binding support, StarPU will not be able to properly benchmark NUMA topology
|
|
#warning Missing binding support, StarPU will not be able to properly benchmark NUMA topology
|
|
#endif
|
|
#endif
|
|
|
|
|
|
- struct starpu_machine_config_s *config = _starpu_get_machine_config();
|
|
|
|
|
|
+ struct _starpu_machine_config *config = _starpu_get_machine_config();
|
|
ncpus = _starpu_topology_get_nhwcpu(config);
|
|
ncpus = _starpu_topology_get_nhwcpu(config);
|
|
|
|
|
|
#ifdef STARPU_USE_CUDA
|
|
#ifdef STARPU_USE_CUDA
|
|
- cudaGetDeviceCount(&ncuda);
|
|
|
|
|
|
+ ncuda = _starpu_get_cuda_device_count();
|
|
for (i = 0; i < ncuda; i++)
|
|
for (i = 0; i < ncuda; i++)
|
|
{
|
|
{
|
|
|
|
+ fprintf(stderr," CUDA %d...", i);
|
|
/* measure bandwidth between Host and Device i */
|
|
/* measure bandwidth between Host and Device i */
|
|
measure_bandwidth_between_host_and_dev(i, cudadev_timing_htod, cudadev_timing_dtoh, cudadev_timing_per_cpu, 'C');
|
|
measure_bandwidth_between_host_and_dev(i, cudadev_timing_htod, cudadev_timing_dtoh, cudadev_timing_per_cpu, 'C');
|
|
}
|
|
}
|
|
|
|
+#ifdef HAVE_CUDA_MEMCPY_PEER
|
|
|
|
+ for (i = 0; i < ncuda; i++)
|
|
|
|
+ for (j = 0; j < ncuda; j++)
|
|
|
|
+ if (i != j)
|
|
|
|
+ {
|
|
|
|
+ fprintf(stderr," CUDA %d -> %d...", i, j);
|
|
|
|
+ /* measure bandwidth between Host and Device i */
|
|
|
|
+ measure_bandwidth_between_dev_and_dev_cuda(i, j);
|
|
|
|
+ }
|
|
|
|
+#endif
|
|
#endif
|
|
#endif
|
|
#ifdef STARPU_USE_OPENCL
|
|
#ifdef STARPU_USE_OPENCL
|
|
nopencl = _starpu_opencl_get_device_count();
|
|
nopencl = _starpu_opencl_get_device_count();
|
|
for (i = 0; i < nopencl; i++)
|
|
for (i = 0; i < nopencl; i++)
|
|
{
|
|
{
|
|
|
|
+ fprintf(stderr," OpenCL %d...", i);
|
|
/* measure bandwith between Host and Device i */
|
|
/* measure bandwith between Host and Device i */
|
|
measure_bandwidth_between_host_and_dev(i, opencldev_timing_htod, opencldev_timing_dtoh, opencldev_timing_per_cpu, 'O');
|
|
measure_bandwidth_between_host_and_dev(i, opencldev_timing_htod, opencldev_timing_dtoh, opencldev_timing_per_cpu, 'O');
|
|
}
|
|
}
|
|
@@ -477,7 +574,7 @@ static void get_bus_path(const char *type, char *path, size_t maxlen)
|
|
char hostname[32];
|
|
char hostname[32];
|
|
char *forced_hostname = getenv("STARPU_HOSTNAME");
|
|
char *forced_hostname = getenv("STARPU_HOSTNAME");
|
|
if (forced_hostname && forced_hostname[0])
|
|
if (forced_hostname && forced_hostname[0])
|
|
- snprintf(hostname, sizeof(hostname), forced_hostname);
|
|
|
|
|
|
+ snprintf(hostname, sizeof(hostname), "%s", forced_hostname);
|
|
else
|
|
else
|
|
gethostname(hostname, sizeof(hostname));
|
|
gethostname(hostname, sizeof(hostname));
|
|
strncat(path, ".", maxlen);
|
|
strncat(path, ".", maxlen);
|
|
@@ -495,6 +592,7 @@ static void get_affinity_path(char *path, size_t maxlen)
|
|
|
|
|
|
static void load_bus_affinity_file_content(void)
|
|
static void load_bus_affinity_file_content(void)
|
|
{
|
|
{
|
|
|
|
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
|
|
FILE *f;
|
|
FILE *f;
|
|
|
|
|
|
char path[256];
|
|
char path[256];
|
|
@@ -503,13 +601,12 @@ static void load_bus_affinity_file_content(void)
|
|
f = fopen(path, "r");
|
|
f = fopen(path, "r");
|
|
STARPU_ASSERT(f);
|
|
STARPU_ASSERT(f);
|
|
|
|
|
|
-#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
|
|
|
|
- struct starpu_machine_config_s *config = _starpu_get_machine_config();
|
|
|
|
|
|
+ struct _starpu_machine_config *config = _starpu_get_machine_config();
|
|
ncpus = _starpu_topology_get_nhwcpu(config);
|
|
ncpus = _starpu_topology_get_nhwcpu(config);
|
|
int gpu;
|
|
int gpu;
|
|
|
|
|
|
#ifdef STARPU_USE_CUDA
|
|
#ifdef STARPU_USE_CUDA
|
|
- cudaGetDeviceCount(&ncuda);
|
|
|
|
|
|
+ ncuda = _starpu_get_cuda_device_count();
|
|
for (gpu = 0; gpu < ncuda; gpu++)
|
|
for (gpu = 0; gpu < ncuda; gpu++)
|
|
{
|
|
{
|
|
int ret;
|
|
int ret;
|
|
@@ -532,7 +629,7 @@ static void load_bus_affinity_file_content(void)
|
|
ret = fscanf(f, "\n");
|
|
ret = fscanf(f, "\n");
|
|
STARPU_ASSERT(ret == 0);
|
|
STARPU_ASSERT(ret == 0);
|
|
}
|
|
}
|
|
-#endif
|
|
|
|
|
|
+#endif /* !STARPU_USE_CUDA */
|
|
#ifdef STARPU_USE_OPENCL
|
|
#ifdef STARPU_USE_OPENCL
|
|
nopencl = _starpu_opencl_get_device_count();
|
|
nopencl = _starpu_opencl_get_device_count();
|
|
for (gpu = 0; gpu < nopencl; gpu++)
|
|
for (gpu = 0; gpu < nopencl; gpu++)
|
|
@@ -557,21 +654,21 @@ static void load_bus_affinity_file_content(void)
|
|
ret = fscanf(f, "\n");
|
|
ret = fscanf(f, "\n");
|
|
STARPU_ASSERT(ret == 0);
|
|
STARPU_ASSERT(ret == 0);
|
|
}
|
|
}
|
|
-#endif
|
|
|
|
-#endif
|
|
|
|
|
|
+#endif /* !STARPU_USE_OPENCL */
|
|
|
|
|
|
fclose(f);
|
|
fclose(f);
|
|
|
|
+#endif /* !(STARPU_USE_CUDA_ || STARPU_USE_OPENCL */
|
|
|
|
+
|
|
}
|
|
}
|
|
|
|
|
|
static void write_bus_affinity_file_content(void)
|
|
static void write_bus_affinity_file_content(void)
|
|
{
|
|
{
|
|
- FILE *f;
|
|
|
|
-
|
|
|
|
STARPU_ASSERT(was_benchmarked);
|
|
STARPU_ASSERT(was_benchmarked);
|
|
|
|
|
|
|
|
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
|
|
|
|
+ FILE *f;
|
|
char path[256];
|
|
char path[256];
|
|
get_affinity_path(path, 256);
|
|
get_affinity_path(path, 256);
|
|
-
|
|
|
|
f = fopen(path, "w+");
|
|
f = fopen(path, "w+");
|
|
if (!f)
|
|
if (!f)
|
|
{
|
|
{
|
|
@@ -581,7 +678,6 @@ static void write_bus_affinity_file_content(void)
|
|
STARPU_ABORT();
|
|
STARPU_ABORT();
|
|
}
|
|
}
|
|
|
|
|
|
-#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
|
|
|
|
unsigned cpu;
|
|
unsigned cpu;
|
|
int gpu;
|
|
int gpu;
|
|
|
|
|
|
@@ -689,12 +785,14 @@ static int load_bus_latency_file_content(void)
|
|
double latency;
|
|
double latency;
|
|
|
|
|
|
n = fscanf(f, "%lf", &latency);
|
|
n = fscanf(f, "%lf", &latency);
|
|
- if (n != 1) {
|
|
|
|
|
|
+ if (n != 1)
|
|
|
|
+ {
|
|
fclose(f);
|
|
fclose(f);
|
|
return 0;
|
|
return 0;
|
|
}
|
|
}
|
|
n = getc(f);
|
|
n = getc(f);
|
|
- if (n != '\t') {
|
|
|
|
|
|
+ if (n != '\t')
|
|
|
|
+ {
|
|
fclose(f);
|
|
fclose(f);
|
|
return 0;
|
|
return 0;
|
|
}
|
|
}
|
|
@@ -703,7 +801,8 @@ static int load_bus_latency_file_content(void)
|
|
}
|
|
}
|
|
|
|
|
|
n = getc(f);
|
|
n = getc(f);
|
|
- if (n != '\n') {
|
|
|
|
|
|
+ if (n != '\n')
|
|
|
|
+ {
|
|
fclose(f);
|
|
fclose(f);
|
|
return 0;
|
|
return 0;
|
|
}
|
|
}
|
|
@@ -750,17 +849,19 @@ static void write_bus_latency_file_content(void)
|
|
if ((src > maxnode) || (dst > maxnode))
|
|
if ((src > maxnode) || (dst > maxnode))
|
|
{
|
|
{
|
|
/* convention */
|
|
/* convention */
|
|
- latency = -1.0;
|
|
|
|
|
|
+ latency = NAN;
|
|
}
|
|
}
|
|
else if (src == dst)
|
|
else if (src == dst)
|
|
{
|
|
{
|
|
latency = 0.0;
|
|
latency = 0.0;
|
|
}
|
|
}
|
|
- else {
|
|
|
|
|
|
+ else
|
|
|
|
+ {
|
|
|
|
+ /* µs */
|
|
latency = ((src && dst)?2000.0:500.0);
|
|
latency = ((src && dst)?2000.0:500.0);
|
|
}
|
|
}
|
|
|
|
|
|
- fprintf(f, "%lf\t", latency);
|
|
|
|
|
|
+ fprintf(f, "%f\t", latency);
|
|
}
|
|
}
|
|
|
|
|
|
fprintf(f, "\n");
|
|
fprintf(f, "\n");
|
|
@@ -828,13 +929,15 @@ static int load_bus_bandwidth_file_content(void)
|
|
double bandwidth;
|
|
double bandwidth;
|
|
|
|
|
|
n = fscanf(f, "%lf", &bandwidth);
|
|
n = fscanf(f, "%lf", &bandwidth);
|
|
- if (n != 1) {
|
|
|
|
|
|
+ if (n != 1)
|
|
|
|
+ {
|
|
fprintf(stderr,"didn't get a number\n");
|
|
fprintf(stderr,"didn't get a number\n");
|
|
fclose(f);
|
|
fclose(f);
|
|
return 0;
|
|
return 0;
|
|
}
|
|
}
|
|
n = getc(f);
|
|
n = getc(f);
|
|
- if (n != '\t') {
|
|
|
|
|
|
+ if (n != '\t')
|
|
|
|
+ {
|
|
fclose(f);
|
|
fclose(f);
|
|
return 0;
|
|
return 0;
|
|
}
|
|
}
|
|
@@ -843,7 +946,8 @@ static int load_bus_bandwidth_file_content(void)
|
|
}
|
|
}
|
|
|
|
|
|
n = getc(f);
|
|
n = getc(f);
|
|
- if (n != '\n') {
|
|
|
|
|
|
+ if (n != '\n')
|
|
|
|
+ {
|
|
fclose(f);
|
|
fclose(f);
|
|
return 0;
|
|
return 0;
|
|
}
|
|
}
|
|
@@ -883,36 +987,43 @@ static void write_bus_bandwidth_file_content(void)
|
|
|
|
|
|
if ((src > maxnode) || (dst > maxnode))
|
|
if ((src > maxnode) || (dst > maxnode))
|
|
{
|
|
{
|
|
- bandwidth = -1.0;
|
|
|
|
|
|
+ bandwidth = NAN;
|
|
}
|
|
}
|
|
#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
|
|
#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
|
|
else if (src != dst)
|
|
else if (src != dst)
|
|
{
|
|
{
|
|
- double time_src_to_ram=0.0, time_ram_to_dst=0.0;
|
|
|
|
- double timing;
|
|
|
|
- /* Bandwidth = (SIZE)/(time i -> ram + time ram -> j)*/
|
|
|
|
|
|
+ double slowness = 0.0;
|
|
|
|
+ /* Total bandwidth is the harmonic mean of bandwidths */
|
|
#ifdef STARPU_USE_CUDA
|
|
#ifdef STARPU_USE_CUDA
|
|
- time_src_to_ram = (src==0)?0.0:cudadev_timing_dtoh[src];
|
|
|
|
- time_ram_to_dst = (dst==0)?0.0:cudadev_timing_htod[dst];
|
|
|
|
- timing =time_src_to_ram + time_ram_to_dst;
|
|
|
|
- bandwidth = 1.0*cuda_size/timing;
|
|
|
|
|
|
+#ifdef HAVE_CUDA_MEMCPY_PEER
|
|
|
|
+ if (src && src <= ncuda && dst && dst <= ncuda)
|
|
|
|
+ /* Direct GPU-GPU transfert */
|
|
|
|
+ slowness = cudadev_timing_dtod[src][dst];
|
|
|
|
+ else
|
|
|
|
+#endif
|
|
|
|
+ {
|
|
|
|
+ if (src && src <= ncuda)
|
|
|
|
+ slowness += cudadev_timing_dtoh[src];
|
|
|
|
+ if (dst && dst <= ncuda)
|
|
|
|
+ slowness += cudadev_timing_htod[dst];
|
|
|
|
+ }
|
|
#endif
|
|
#endif
|
|
#ifdef STARPU_USE_OPENCL
|
|
#ifdef STARPU_USE_OPENCL
|
|
- if (src > ncuda)
|
|
|
|
- time_src_to_ram = (src==0)?0.0:opencldev_timing_dtoh[src-ncuda];
|
|
|
|
- if (dst > ncuda)
|
|
|
|
- time_ram_to_dst = (dst==0)?0.0:opencldev_timing_htod[dst-ncuda];
|
|
|
|
- timing =time_src_to_ram + time_ram_to_dst;
|
|
|
|
- bandwidth = 1.0*opencl_size/timing;
|
|
|
|
|
|
+ if (src > ncuda)
|
|
|
|
+ slowness += opencldev_timing_dtoh[src-ncuda];
|
|
|
|
+ if (dst > ncuda)
|
|
|
|
+ slowness += opencldev_timing_htod[dst-ncuda];
|
|
#endif
|
|
#endif
|
|
|
|
+ bandwidth = 1.0/slowness;
|
|
}
|
|
}
|
|
#endif
|
|
#endif
|
|
- else {
|
|
|
|
|
|
+ else
|
|
|
|
+ {
|
|
/* convention */
|
|
/* convention */
|
|
bandwidth = 0.0;
|
|
bandwidth = 0.0;
|
|
}
|
|
}
|
|
|
|
|
|
- fprintf(f, "%lf\t", bandwidth);
|
|
|
|
|
|
+ fprintf(f, "%f\t", bandwidth);
|
|
}
|
|
}
|
|
|
|
|
|
fprintf(f, "\n");
|
|
fprintf(f, "\n");
|
|
@@ -921,37 +1032,38 @@ static void write_bus_bandwidth_file_content(void)
|
|
fclose(f);
|
|
fclose(f);
|
|
}
|
|
}
|
|
|
|
|
|
-void starpu_print_bus_bandwidth(FILE *f)
|
|
|
|
|
|
+void starpu_bus_print_bandwidth(FILE *f)
|
|
{
|
|
{
|
|
- int src, dst, maxnode;
|
|
|
|
|
|
+ int src, dst, maxnode;
|
|
|
|
|
|
- maxnode = ncuda;
|
|
|
|
|
|
+ maxnode = ncuda;
|
|
#ifdef STARPU_USE_OPENCL
|
|
#ifdef STARPU_USE_OPENCL
|
|
- maxnode += nopencl;
|
|
|
|
|
|
+ maxnode += nopencl;
|
|
#endif
|
|
#endif
|
|
|
|
|
|
- fprintf(f, "from\t");
|
|
|
|
- fprintf(f, "to RAM\t\t");
|
|
|
|
- for (dst = 0; dst < ncuda; dst++)
|
|
|
|
- fprintf(f, "to CUDA %d\t", dst);
|
|
|
|
- for (dst = 0; dst < nopencl; dst++)
|
|
|
|
- fprintf(f, "to OpenCL %d\t", dst);
|
|
|
|
- fprintf(f, "\n");
|
|
|
|
-
|
|
|
|
- for (src = 0; src <= maxnode; src++)
|
|
|
|
- {
|
|
|
|
- if (!src)
|
|
|
|
- fprintf(f, "RAM\t");
|
|
|
|
- else if (src <= ncuda)
|
|
|
|
- fprintf(f, "CUDA %d\t", src-1);
|
|
|
|
- else
|
|
|
|
- fprintf(f, "OpenCL%d\t", src-ncuda-1);
|
|
|
|
- for (dst = 0; dst <= maxnode; dst++)
|
|
|
|
- fprintf(f, "%f\t", bandwidth_matrix[src][dst]);
|
|
|
|
-
|
|
|
|
- fprintf(f, "\n");
|
|
|
|
- }
|
|
|
|
|
|
+ fprintf(f, "from\t");
|
|
|
|
+ fprintf(f, "to RAM\t\t");
|
|
|
|
+ for (dst = 0; dst < ncuda; dst++)
|
|
|
|
+ fprintf(f, "to CUDA %d\t", dst);
|
|
|
|
+ for (dst = 0; dst < nopencl; dst++)
|
|
|
|
+ fprintf(f, "to OpenCL %d\t", dst);
|
|
|
|
+ fprintf(f, "\n");
|
|
|
|
+
|
|
|
|
+ for (src = 0; src <= maxnode; src++)
|
|
|
|
+ {
|
|
|
|
+ if (!src)
|
|
|
|
+ fprintf(f, "RAM\t");
|
|
|
|
+ else if (src <= ncuda)
|
|
|
|
+ fprintf(f, "CUDA %d\t", src-1);
|
|
|
|
+ else
|
|
|
|
+ fprintf(f, "OpenCL%d\t", src-ncuda-1);
|
|
|
|
+ for (dst = 0; dst <= maxnode; dst++)
|
|
|
|
+ fprintf(f, "%f\t", bandwidth_matrix[src][dst]);
|
|
|
|
+
|
|
|
|
+ fprintf(f, "\n");
|
|
|
|
+ }
|
|
}
|
|
}
|
|
|
|
+
|
|
static void generate_bus_bandwidth_file(void)
|
|
static void generate_bus_bandwidth_file(void)
|
|
{
|
|
{
|
|
if (!was_benchmarked)
|
|
if (!was_benchmarked)
|
|
@@ -990,16 +1102,18 @@ static void check_bus_config_file()
|
|
|
|
|
|
get_config_path(path, 256);
|
|
get_config_path(path, 256);
|
|
res = access(path, F_OK);
|
|
res = access(path, F_OK);
|
|
- if (res) {
|
|
|
|
|
|
+ if (res)
|
|
|
|
+ {
|
|
fprintf(stderr, "No performance model for the bus, calibrating...");
|
|
fprintf(stderr, "No performance model for the bus, calibrating...");
|
|
starpu_force_bus_sampling();
|
|
starpu_force_bus_sampling();
|
|
fprintf(stderr, "done\n");
|
|
fprintf(stderr, "done\n");
|
|
}
|
|
}
|
|
- else {
|
|
|
|
|
|
+ else
|
|
|
|
+ {
|
|
FILE *f;
|
|
FILE *f;
|
|
int ret, read_cuda, read_opencl;
|
|
int ret, read_cuda, read_opencl;
|
|
unsigned read_cpus;
|
|
unsigned read_cpus;
|
|
- struct starpu_machine_config_s *config = _starpu_get_machine_config();
|
|
|
|
|
|
+ struct _starpu_machine_config *config = _starpu_get_machine_config();
|
|
|
|
|
|
// Loading configuration from file
|
|
// Loading configuration from file
|
|
f = fopen(path, "r");
|
|
f = fopen(path, "r");
|
|
@@ -1019,24 +1133,27 @@ static void check_bus_config_file()
|
|
// Loading current configuration
|
|
// Loading current configuration
|
|
ncpus = _starpu_topology_get_nhwcpu(config);
|
|
ncpus = _starpu_topology_get_nhwcpu(config);
|
|
#ifdef STARPU_USE_CUDA
|
|
#ifdef STARPU_USE_CUDA
|
|
- cudaGetDeviceCount(&ncuda);
|
|
|
|
|
|
+ ncuda = _starpu_get_cuda_device_count();
|
|
#endif
|
|
#endif
|
|
#ifdef STARPU_USE_OPENCL
|
|
#ifdef STARPU_USE_OPENCL
|
|
nopencl = _starpu_opencl_get_device_count();
|
|
nopencl = _starpu_opencl_get_device_count();
|
|
#endif
|
|
#endif
|
|
|
|
|
|
// Checking if both configurations match
|
|
// Checking if both configurations match
|
|
- if (read_cpus != ncpus) {
|
|
|
|
|
|
+ if (read_cpus != ncpus)
|
|
|
|
+ {
|
|
fprintf(stderr, "Current configuration does not match the bus performance model (CPUS: (stored) %u != (current) %u), recalibrating...", read_cpus, ncpus);
|
|
fprintf(stderr, "Current configuration does not match the bus performance model (CPUS: (stored) %u != (current) %u), recalibrating...", read_cpus, ncpus);
|
|
starpu_force_bus_sampling();
|
|
starpu_force_bus_sampling();
|
|
fprintf(stderr, "done\n");
|
|
fprintf(stderr, "done\n");
|
|
}
|
|
}
|
|
- else if (read_cuda != ncuda) {
|
|
|
|
|
|
+ else if (read_cuda != ncuda)
|
|
|
|
+ {
|
|
fprintf(stderr, "Current configuration does not match the bus performance model (CUDA: (stored) %d != (current) %d), recalibrating...", read_cuda, ncuda);
|
|
fprintf(stderr, "Current configuration does not match the bus performance model (CUDA: (stored) %d != (current) %d), recalibrating...", read_cuda, ncuda);
|
|
starpu_force_bus_sampling();
|
|
starpu_force_bus_sampling();
|
|
fprintf(stderr, "done\n");
|
|
fprintf(stderr, "done\n");
|
|
}
|
|
}
|
|
- else if (read_opencl != nopencl) {
|
|
|
|
|
|
+ else if (read_opencl != nopencl)
|
|
|
|
+ {
|
|
fprintf(stderr, "Current configuration does not match the bus performance model (OpenCL: (stored) %d != (current) %d), recalibrating...", read_opencl, nopencl);
|
|
fprintf(stderr, "Current configuration does not match the bus performance model (OpenCL: (stored) %d != (current) %d), recalibrating...", read_opencl, nopencl);
|
|
starpu_force_bus_sampling();
|
|
starpu_force_bus_sampling();
|
|
fprintf(stderr, "done\n");
|
|
fprintf(stderr, "done\n");
|
|
@@ -1094,11 +1211,12 @@ void _starpu_load_bus_performance_files(void)
|
|
load_bus_bandwidth_file();
|
|
load_bus_bandwidth_file();
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+/* (in µs) */
|
|
double _starpu_predict_transfer_time(unsigned src_node, unsigned dst_node, size_t size)
|
|
double _starpu_predict_transfer_time(unsigned src_node, unsigned dst_node, size_t size)
|
|
{
|
|
{
|
|
double bandwidth = bandwidth_matrix[src_node][dst_node];
|
|
double bandwidth = bandwidth_matrix[src_node][dst_node];
|
|
double latency = latency_matrix[src_node][dst_node];
|
|
double latency = latency_matrix[src_node][dst_node];
|
|
- struct starpu_machine_topology_s *topology = &_starpu_get_machine_config()->topology;
|
|
|
|
|
|
+ struct starpu_machine_topology *topology = &_starpu_get_machine_config()->topology;
|
|
|
|
|
|
return latency + (size/bandwidth)*2*(topology->ncudagpus+topology->nopenclgpus);
|
|
return latency + (size/bandwidth)*2*(topology->ncudagpus+topology->nopenclgpus);
|
|
}
|
|
}
|