7 years ago · babb4e46f0
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2011-2014,2016,2017                      Inria
															
 
																- * Copyright (C) 2009-2018                                Université de Bordeaux
															
 
																+ * Copyright (C) 2009-2019                                Université de Bordeaux
															
 
																  * Copyright (C) 2010-2017                                CNRS
															
 
																  * Copyright (C) 2013                                     Corentin Salingue
															
 
																  *
															
@@ -217,6 +217,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 
																 	/* Fill them */
															
 
																 	memset(h_buffer, 0, size);
															
 
																 	cudaMemset(d_buffer, 0, size);
															
 
																+	cudaThreadSynchronize();
															
 
																 	/* hack to avoid third party libs to rebind threads */
															
 
																 	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
															
@@ -335,6 +336,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
																 	cures = cudaMalloc((void **)&s_buffer, size);
															
 
																 	STARPU_ASSERT(cures == cudaSuccess);
															
 
																 	cudaMemset(s_buffer, 0, size);
															
 
																+	cudaThreadSynchronize();
															
 
																 	/* Initialize CUDA context on the destination */
															
 
																 	/* We do not need to enable OpenGL interoperability at this point,
															
@@ -360,6 +362,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
																 	cures = cudaMalloc((void **)&d_buffer, size);
															
 
																 	STARPU_ASSERT(cures == cudaSuccess);
															
 
																 	cudaMemset(d_buffer, 0, size);
															
 
																+	cudaThreadSynchronize();
															
 
																 	unsigned iter;
															
 
																 	double timing;
															
--- a/tests/datawizard/manual_reduction.c
+++ b/tests/datawizard/manual_reduction.c
@@ -74,7 +74,7 @@ static void initialize_per_worker_handle(void *arg)
 
																 			{
															
 
																 				STARPU_CUDA_REPORT_ERROR(status);
															
 
																 			}
															
 
																-			status = cudaMemset((void *)per_worker[workerid], 0, sizeof(variable));
															
 
																+			status = cudaMemsetAsync((void *)per_worker[workerid], 0, sizeof(variable), starpu_cuda_get_local_stream());
															
 
																 			if (status)
															
 
																 				STARPU_CUDA_REPORT_ERROR(status);
															
 
																 			break;