Ver código fonte

Fix setting the proper CUDA stream when using several streams from the same driver thread

Samuel Thibault 8 anos atrás
pai
commit
a41080d2a6

+ 8 - 0
doc/doxygen/chapters/api/cuda_extensions.doxy

@@ -67,6 +67,14 @@ starpu_cublas_init() will initialize CUBLAS on every CUDA device
 controlled by StarPU. This call blocks until CUBLAS has been properly
 controlled by StarPU. This call blocks until CUBLAS has been properly
 initialized on every device.
 initialized on every device.
 
 
+\fn void starpu_cublas_set_stream(void)
+\ingroup API_CUDA_Extensions
+This function sets the proper cublas stream. This must be called from the CUDA
+codelet before calling cublas kernels, so that they are queued on the proper
+CUDA stream. When using one thread per CUDA worker, this function does not
+do anything since the cublas stream does not change, and is set once by
+starpu_cublas_init().
+
 \fn void starpu_cublas_shutdown(void)
 \fn void starpu_cublas_shutdown(void)
 \ingroup API_CUDA_Extensions
 \ingroup API_CUDA_Extensions
 This function synchronously deinitializes the CUBLAS library on
 This function synchronously deinitializes the CUBLAS library on

+ 1 - 0
examples/audio/starpu_audio_processing.c

@@ -198,6 +198,7 @@ static void band_filter_kernel_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *
 
 
 	localout = plans[workerid].localout;
 	localout = plans[workerid].localout;
 
 
+	starpu_cublas_set_stream();
 	/* FFT */
 	/* FFT */
 	cures = cufftExecR2C(plans[workerid].plan, localA, localout);
 	cures = cufftExecR2C(plans[workerid].plan, localA, localout);
 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
 	STARPU_ASSERT(cures == CUFFT_SUCCESS);

+ 1 - 0
examples/axpy/axpy.c

@@ -74,6 +74,7 @@ void axpy_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
 	TYPE *block_x = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *block_x = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *block_y = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 	TYPE *block_y = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 
 
+	starpu_cublas_set_stream();
 	CUBLASAXPY((int)n, alpha, block_x, 1, block_y, 1);
 	CUBLASAXPY((int)n, alpha, block_x, 1, block_y, 1);
 }
 }
 #endif
 #endif

+ 6 - 0
examples/cg/cg_kernels.c

@@ -81,6 +81,7 @@ static void accumulate_variable_cuda(void *descr[], void *cl_arg)
 	TYPE *v_dst = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	TYPE *v_dst = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	TYPE *v_src = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
 	TYPE *v_src = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
  
  
+	starpu_cublas_set_stream();
 	cublasaxpy(1, (TYPE)1.0, v_src, 1, v_dst, 1);
 	cublasaxpy(1, (TYPE)1.0, v_src, 1, v_dst, 1);
 }
 }
 #endif
 #endif
@@ -120,6 +121,7 @@ static void accumulate_vector_cuda(void *descr[], void *cl_arg)
 	TYPE *v_src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 	TYPE *v_src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  
  
+	starpu_cublas_set_stream();
 	cublasaxpy(n, (TYPE)1.0, v_src, 1, v_dst, 1);
 	cublasaxpy(n, (TYPE)1.0, v_src, 1, v_dst, 1);
 }
 }
 #endif
 #endif
@@ -335,6 +337,7 @@ static void scal_kernel_cuda(void *descr[], void *cl_arg)
  
  
 	/* v1 = p1 v1 */
 	/* v1 = p1 v1 */
 	TYPE alpha = p1;
 	TYPE alpha = p1;
+	starpu_cublas_set_stream();
 	cublasscal(n, alpha, v1, 1);
 	cublasscal(n, alpha, v1, 1);
 }
 }
 #endif
 #endif
@@ -389,6 +392,7 @@ static void gemv_kernel_cuda(void *descr[], void *cl_arg)
 	starpu_codelet_unpack_args(cl_arg, &beta, &alpha);
 	starpu_codelet_unpack_args(cl_arg, &beta, &alpha);
 
 
 	/* Compute v1 = alpha M v2 + beta v1 */
 	/* Compute v1 = alpha M v2 + beta v1 */
+	starpu_cublas_set_stream();
 	cublasgemv('N', nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
 	cublasgemv('N', nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
 }
 }
 #endif
 #endif
@@ -504,6 +508,7 @@ static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
 	 *	v1 = p1 v1
 	 *	v1 = p1 v1
 	 *	v1 = v1 + p2 v2
 	 *	v1 = v1 + p2 v2
 	 */
 	 */
+	starpu_cublas_set_stream();
 	cublasscal(n, p1, v1, 1);
 	cublasscal(n, p1, v1, 1);
 	cublasaxpy(n, p2, v2, 1, v1, 1);
 	cublasaxpy(n, p2, v2, 1, v1, 1);
 }
 }
@@ -584,6 +589,7 @@ static void axpy_kernel_cuda(void *descr[], void *cl_arg)
  
  
 	/* Compute v1 = v1 + p1 * v2.
 	/* Compute v1 = v1 + p1 * v2.
 	 */
 	 */
+	starpu_cublas_set_stream();
 	cublasaxpy(n, p1, v2, 1, v1, 1);
 	cublasaxpy(n, p1, v2, 1, v1, 1);
 }
 }
 #endif
 #endif

+ 6 - 0
examples/cholesky/cholesky_kernels.c

@@ -80,6 +80,8 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STAR
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_HAVE_MAGMA
 #ifdef STARPU_HAVE_MAGMA
 		cublasSetKernelStream(starpu_cuda_get_local_stream());
 		cublasSetKernelStream(starpu_cuda_get_local_stream());
+#else
+		starpu_cublas_set_stream();
 #endif
 #endif
 		cublasSgemm('n', 't', dy, dx, dz, 
 		cublasSgemm('n', 't', dy, dx, dz, 
 				-1.0f, left, ld21, right, ld12, 
 				-1.0f, left, ld21, right, ld12, 
@@ -129,6 +131,8 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, STARPU_A
 		case 1:
 		case 1:
 #ifdef STARPU_HAVE_MAGMA
 #ifdef STARPU_HAVE_MAGMA
 			cublasSetKernelStream(starpu_cuda_get_local_stream());
 			cublasSetKernelStream(starpu_cuda_get_local_stream());
+#else
+			starpu_cublas_set_stream();
 #endif
 #endif
 			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
 			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
 			break;
 			break;
@@ -205,6 +209,8 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 #if (MAGMA_VERSION_MAJOR > 1) || (MAGMA_VERSION_MAJOR == 1 && MAGMA_VERSION_MINOR >= 4)
 #if (MAGMA_VERSION_MAJOR > 1) || (MAGMA_VERSION_MAJOR == 1 && MAGMA_VERSION_MINOR >= 4)
 			cublasSetKernelStream(starpu_cuda_get_local_stream());
 			cublasSetKernelStream(starpu_cuda_get_local_stream());
 			magmablasSetKernelStream(starpu_cuda_get_local_stream());
 			magmablasSetKernelStream(starpu_cuda_get_local_stream());
+#else
+			starpu_cublas_set_stream();
 #endif
 #endif
 			ret = magma_spotrf_gpu(MagmaLower, nx, sub11, ld, &info);
 			ret = magma_spotrf_gpu(MagmaLower, nx, sub11, ld, &info);
 			if (ret != MAGMA_SUCCESS)
 			if (ret != MAGMA_SUCCESS)

+ 4 - 0
examples/heat/dw_factolu_kernels.c

@@ -134,6 +134,7 @@ static inline void dw_common_cpu_codelet_update_u22(void *descr[], int s, STARPU
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
+			starpu_cublas_set_stream();
 			cublasSgemm('n', 'n', dx, dy, dz, -1.0f, left, ld21,
 			cublasSgemm('n', 'n', dx, dy, dz, -1.0f, left, ld21,
 					right, ld12, 1.0f, center, ld22);
 					right, ld12, 1.0f, center, ld22);
 			status = cublasGetError();
 			status = cublasGetError();
@@ -197,6 +198,7 @@ static inline void dw_common_codelet_update_u12(void *descr[], int s, STARPU_ATT
 			break;
 			break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
+			starpu_cublas_set_stream();
 			cublasStrsm('L', 'L', 'N', 'N', ny12, nx12,
 			cublasStrsm('L', 'L', 'N', 'N', ny12, nx12,
 					1.0f, sub11, ld11, sub12, ld12);
 					1.0f, sub11, ld11, sub12, ld12);
 			status = cublasGetError();
 			status = cublasGetError();
@@ -258,6 +260,7 @@ static inline void dw_common_codelet_update_u21(void *descr[], int s, STARPU_ATT
 			break;
 			break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
+			starpu_cublas_set_stream();
 			cublasStrsm('R', 'U', 'N', 'U', ny21, nx21, 1.0f, sub11, ld11, sub21, ld21);
 			cublasStrsm('R', 'U', 'N', 'U', ny21, nx21, 1.0f, sub11, ld11, sub21, ld21);
 			status = cublasGetError();
 			status = cublasGetError();
 			if (status != CUBLAS_STATUS_SUCCESS)
 			if (status != CUBLAS_STATUS_SUCCESS)
@@ -338,6 +341,7 @@ static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATT
 			break;
 			break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
+			starpu_cublas_set_stream();
 			for (z = 0; z < nx; z++)
 			for (z = 0; z < nx; z++)
 			{
 			{
 				float pivot;
 				float pivot;

+ 6 - 0
examples/heat/dw_sparse_cg_kernels.c

@@ -146,6 +146,7 @@ void cublas_codelet_func_3(void *descr[], void *arg)
 	vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
 
+	starpu_cublas_set_stream();
 	dot = cublasSdot (size, vec, 1, vec, 1);
 	dot = cublasSdot (size, vec, 1, vec, 1);
 
 
 	pb->delta_new = dot;
 	pb->delta_new = dot;
@@ -238,6 +239,7 @@ void cublas_codelet_func_5(void *descr[], void *arg)
 	STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
 	STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
 
+	starpu_cublas_set_stream();
 	dot = cublasSdot (size, vecd, 1, vecq, 1);
 	dot = cublasSdot (size, vecd, 1, vecq, 1);
 
 
 	pb->alpha = pb->delta_new / dot;
 	pb->alpha = pb->delta_new / dot;
@@ -281,6 +283,7 @@ void cublas_codelet_func_6(void *descr[], void *arg)
 
 
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
 
+	starpu_cublas_set_stream();
 	cublasSaxpy (size, pb->alpha, vecd, 1, vecx, 1);
 	cublasSaxpy (size, pb->alpha, vecd, 1, vecx, 1);
 }
 }
 #endif
 #endif
@@ -320,6 +323,7 @@ void cublas_codelet_func_7(void *descr[], void *arg)
 
 
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
 
+	starpu_cublas_set_stream();
 	cublasSaxpy (size, -pb->alpha, vecq, 1, vecr, 1);
 	cublasSaxpy (size, -pb->alpha, vecq, 1, vecr, 1);
 }
 }
 #endif
 #endif
@@ -363,6 +367,7 @@ void cublas_codelet_func_8(void *descr[], void *arg)
 	vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
 
+	starpu_cublas_set_stream();
 	dot = cublasSdot (size, vecr, 1, vecr, 1);
 	dot = cublasSdot (size, vecr, 1, vecr, 1);
 
 
 	pb->delta_old = pb->delta_new;
 	pb->delta_old = pb->delta_new;
@@ -411,6 +416,7 @@ void cublas_codelet_func_9(void *descr[], void *arg)
 
 
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
 
+	starpu_cublas_set_stream();
 	/* d = beta d */
 	/* d = beta d */
 	cublasSscal(size, pb->beta, vecd, 1);
 	cublasSscal(size, pb->beta, vecd, 1);
 
 

+ 6 - 0
examples/lu/xlu_kernels.c

@@ -65,6 +65,7 @@ static inline void STARPU_LU(common_u22)(void *descr[],
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
 		{
 		{
+			starpu_cublas_set_stream();
 			CUBLAS_GEMM('n', 'n', dx, dy, dz,
 			CUBLAS_GEMM('n', 'n', dx, dy, dz,
 				*(CUBLAS_TYPE*)&m1, (CUBLAS_TYPE *)right, ld21, (CUBLAS_TYPE *)left, ld12,
 				*(CUBLAS_TYPE*)&m1, (CUBLAS_TYPE *)right, ld21, (CUBLAS_TYPE *)left, ld12,
 				*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE *)center, ld22);
 				*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE *)center, ld22);
@@ -185,6 +186,7 @@ static inline void STARPU_LU(common_u12)(void *descr[],
 			break;
 			break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
+			starpu_cublas_set_stream();
 			CUBLAS_TRSM('L', 'L', 'N', 'N', ny12, nx12,
 			CUBLAS_TRSM('L', 'L', 'N', 'N', ny12, nx12,
 					*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub12, ld12);
 					*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub12, ld12);
 
 
@@ -271,6 +273,7 @@ static inline void STARPU_LU(common_u21)(void *descr[],
 			break;
 			break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
+			starpu_cublas_set_stream();
 			CUBLAS_TRSM('R', 'U', 'N', 'U', ny21, nx21,
 			CUBLAS_TRSM('R', 'U', 'N', 'U', ny21, nx21,
 					*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub21, ld21);
 					*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub21, ld21);
 
 
@@ -366,6 +369,7 @@ static inline void STARPU_LU(common_u11)(void *descr[],
 			break;
 			break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
+			starpu_cublas_set_stream();
 			for (z = 0; z < nx; z++)
 			for (z = 0; z < nx; z++)
 			{
 			{
 				TYPE pivot;
 				TYPE pivot;
@@ -496,6 +500,7 @@ static inline void STARPU_LU(common_u11_pivot)(void *descr[],
 			break;
 			break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
+			starpu_cublas_set_stream();
 			for (z = 0; z < nx; z++)
 			for (z = 0; z < nx; z++)
 			{
 			{
 				TYPE pivot;
 				TYPE pivot;
@@ -614,6 +619,7 @@ static inline void STARPU_LU(common_pivot)(void *descr[],
 			break;
 			break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
+			starpu_cublas_set_stream();
 			for (row = 0; row < nx; row++)
 			for (row = 0; row < nx; row++)
 			{
 			{
 				unsigned rowpiv = ipiv[row+first] - first;
 				unsigned rowpiv = ipiv[row+first] - first;

+ 1 - 0
examples/mult/xgemm.c

@@ -161,6 +161,7 @@ static void cublas_mult(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
 	unsigned ldB = STARPU_MATRIX_GET_LD(descr[1]);
 	unsigned ldB = STARPU_MATRIX_GET_LD(descr[1]);
 	unsigned ldC = STARPU_MATRIX_GET_LD(descr[2]);
 	unsigned ldC = STARPU_MATRIX_GET_LD(descr[2]);
 
 
+	starpu_cublas_set_stream();
 	CUBLAS_GEMM('n', 'n', nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
 	CUBLAS_GEMM('n', 'n', nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
 				     (TYPE)0.0, subC, ldC);
 				     (TYPE)0.0, subC, ldC);
 }
 }

+ 2 - 0
examples/pipeline/pipeline.c

@@ -101,6 +101,7 @@ void pipeline_cublas_axpy(void *descr[], void *arg)
 	float *y = (float *) STARPU_VECTOR_GET_PTR(descr[1]);
 	float *y = (float *) STARPU_VECTOR_GET_PTR(descr[1]);
 	int n = STARPU_VECTOR_GET_NX(descr[0]);
 	int n = STARPU_VECTOR_GET_NX(descr[0]);
 
 
+	starpu_cublas_set_stream();
 	cublasSaxpy(n, 1., x, 1, y, 1);
 	cublasSaxpy(n, 1., x, 1, y, 1);
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }
@@ -143,6 +144,7 @@ void pipeline_cublas_sum(void *descr[], void *arg)
 	int n = STARPU_VECTOR_GET_NX(descr[0]);
 	int n = STARPU_VECTOR_GET_NX(descr[0]);
 	float y;
 	float y;
 
 
+	starpu_cublas_set_stream();
 	y = cublasSasum(n, x, 1);
 	y = cublasSasum(n, x, 1);
 
 
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());

+ 2 - 0
examples/reductions/dot_product.c

@@ -256,6 +256,8 @@ void dot_cuda_func(void *descr[], void *cl_arg)
 	cudaMemcpyAsync(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 	cudaMemcpyAsync(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 
+	if (cublas_version >= 7050)
+		starpu_cublas_set_stream();
 	local_dot = (DOT_TYPE)cublasSdot(n, local_x, 1, local_y, 1);
 	local_dot = (DOT_TYPE)cublasSdot(n, local_x, 1, local_y, 1);
 
 
 	/* FPRINTF(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot); */
 	/* FPRINTF(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot); */

+ 1 - 0
examples/spmv/dw_block_spmv_kernels.c

@@ -43,6 +43,7 @@ static inline void common_block_spmv(void *descr[], int s, STARPU_ATTRIBUTE_UNUS
 			break;
 			break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
+			starpu_cublas_set_stream();
 			cublasSgemv ('t', dx, dy, 1.0f, block, ld, in, 1, 1.0f, out, 1);
 			cublasSgemv ('t', dx, dy, 1.0f, block, ld, in, 1, 1.0f, out, 1);
 			break;
 			break;
 #endif
 #endif

+ 1 - 0
include/starpu_cublas.h

@@ -24,6 +24,7 @@ extern "C"
 #endif
 #endif
 
 
 void starpu_cublas_init(void);
 void starpu_cublas_init(void);
+void starpu_cublas_set_stream(void);
 void starpu_cublas_shutdown(void);
 void starpu_cublas_shutdown(void);
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus

+ 10 - 0
src/drivers/cuda/starpu_cublas.c

@@ -73,3 +73,13 @@ void starpu_cublas_shutdown(void)
 	starpu_execute_on_each_worker(shutdown_cublas_func, NULL, STARPU_CUDA);
 	starpu_execute_on_each_worker(shutdown_cublas_func, NULL, STARPU_CUDA);
 #endif
 #endif
 }
 }
+
+void starpu_cublas_set_stream(void)
+{
+#ifdef STARPU_USE_CUDA
+	if (
+		(!_starpu_get_machine_config()->topology.cuda_th_per_stream &&
+		 _starpu_get_machine_config()->topology.nworkerpercuda > 1))
+		cublasSetKernelStream(starpu_cuda_get_local_stream());
+#endif
+}

+ 1 - 0
tests/microbenchs/matrix_as_vector.c

@@ -55,6 +55,7 @@ void vector_cuda_func(void *descr[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
 	float *matrix = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	float *matrix = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	int nx = STARPU_VECTOR_GET_NX(descr[0]);
 	int nx = STARPU_VECTOR_GET_NX(descr[0]);
 
 
+	starpu_cublas_set_stream();
 	float sum = cublasSasum(nx, matrix, 1);
 	float sum = cublasSasum(nx, matrix, 1);
 	sum /= nx;
 	sum /= nx;