|
@@ -1,7 +1,7 @@
|
|
/* StarPU --- Runtime system for heterogeneous multicore architectures.
|
|
/* StarPU --- Runtime system for heterogeneous multicore architectures.
|
|
*
|
|
*
|
|
* Copyright (C) 2011-2014,2016,2017 Inria
|
|
* Copyright (C) 2011-2014,2016,2017 Inria
|
|
- * Copyright (C) 2009-2018 Université de Bordeaux
|
|
|
|
|
|
+ * Copyright (C) 2009-2019 Université de Bordeaux
|
|
* Copyright (C) 2010-2017 CNRS
|
|
* Copyright (C) 2010-2017 CNRS
|
|
* Copyright (C) 2013 Corentin Salingue
|
|
* Copyright (C) 2013 Corentin Salingue
|
|
*
|
|
*
|
|
@@ -217,6 +217,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
|
|
/* Fill them */
|
|
/* Fill them */
|
|
memset(h_buffer, 0, size);
|
|
memset(h_buffer, 0, size);
|
|
cudaMemset(d_buffer, 0, size);
|
|
cudaMemset(d_buffer, 0, size);
|
|
|
|
+ cudaThreadSynchronize();
|
|
|
|
|
|
/* hack to avoid third party libs to rebind threads */
|
|
/* hack to avoid third party libs to rebind threads */
|
|
_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
|
|
_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
|
|
@@ -335,6 +336,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
|
|
cures = cudaMalloc((void **)&s_buffer, size);
|
|
cures = cudaMalloc((void **)&s_buffer, size);
|
|
STARPU_ASSERT(cures == cudaSuccess);
|
|
STARPU_ASSERT(cures == cudaSuccess);
|
|
cudaMemset(s_buffer, 0, size);
|
|
cudaMemset(s_buffer, 0, size);
|
|
|
|
+ cudaThreadSynchronize();
|
|
|
|
|
|
/* Initialize CUDA context on the destination */
|
|
/* Initialize CUDA context on the destination */
|
|
/* We do not need to enable OpenGL interoperability at this point,
|
|
/* We do not need to enable OpenGL interoperability at this point,
|
|
@@ -360,6 +362,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
|
|
cures = cudaMalloc((void **)&d_buffer, size);
|
|
cures = cudaMalloc((void **)&d_buffer, size);
|
|
STARPU_ASSERT(cures == cudaSuccess);
|
|
STARPU_ASSERT(cures == cudaSuccess);
|
|
cudaMemset(d_buffer, 0, size);
|
|
cudaMemset(d_buffer, 0, size);
|
|
|
|
+ cudaThreadSynchronize();
|
|
|
|
|
|
unsigned iter;
|
|
unsigned iter;
|
|
double timing;
|
|
double timing;
|