瀏覽代碼

Fix setting the proper CUDA stream when using several streams from the same driver thread

Samuel Thibault 8 年之前
父節點
當前提交
a41080d2a6

+ 8 - 0
doc/doxygen/chapters/api/cuda_extensions.doxy

@@ -67,6 +67,14 @@ starpu_cublas_init() will initialize CUBLAS on every CUDA device
 controlled by StarPU. This call blocks until CUBLAS has been properly
 initialized on every device.
 
+\fn void starpu_cublas_set_stream(void)
+\ingroup API_CUDA_Extensions
+This function sets the proper CUBLAS stream. This must be called from the CUDA
+codelet before calling CUBLAS kernels, so that they are queued on the proper
+CUDA stream. When using one thread per CUDA worker, this function does not
+do anything since the CUBLAS stream does not change, and is set once by
+starpu_cublas_init().
+
 \fn void starpu_cublas_shutdown(void)
 \ingroup API_CUDA_Extensions
 This function synchronously deinitializes the CUBLAS library on

+ 1 - 0
examples/audio/starpu_audio_processing.c

@@ -198,6 +198,7 @@ static void band_filter_kernel_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *
 
 	localout = plans[workerid].localout;
 
+	starpu_cublas_set_stream();
 	/* FFT */
 	cures = cufftExecR2C(plans[workerid].plan, localA, localout);
 	STARPU_ASSERT(cures == CUFFT_SUCCESS);

+ 1 - 0
examples/axpy/axpy.c

@@ -74,6 +74,7 @@ void axpy_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
 	TYPE *block_x = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *block_y = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 
+	starpu_cublas_set_stream();
 	CUBLASAXPY((int)n, alpha, block_x, 1, block_y, 1);
 }
 #endif

+ 6 - 0
examples/cg/cg_kernels.c

@@ -81,6 +81,7 @@ static void accumulate_variable_cuda(void *descr[], void *cl_arg)
 	TYPE *v_dst = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	TYPE *v_src = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
  
+	starpu_cublas_set_stream();
 	cublasaxpy(1, (TYPE)1.0, v_src, 1, v_dst, 1);
 }
 #endif
@@ -120,6 +121,7 @@ static void accumulate_vector_cuda(void *descr[], void *cl_arg)
 	TYPE *v_src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  
+	starpu_cublas_set_stream();
 	cublasaxpy(n, (TYPE)1.0, v_src, 1, v_dst, 1);
 }
 #endif
@@ -335,6 +337,7 @@ static void scal_kernel_cuda(void *descr[], void *cl_arg)
  
 	/* v1 = p1 v1 */
 	TYPE alpha = p1;
+	starpu_cublas_set_stream();
 	cublasscal(n, alpha, v1, 1);
 }
 #endif
@@ -389,6 +392,7 @@ static void gemv_kernel_cuda(void *descr[], void *cl_arg)
 	starpu_codelet_unpack_args(cl_arg, &beta, &alpha);
 
 	/* Compute v1 = alpha M v2 + beta v1 */
+	starpu_cublas_set_stream();
 	cublasgemv('N', nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
 }
 #endif
@@ -504,6 +508,7 @@ static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
 	 *	v1 = p1 v1
 	 *	v1 = v1 + p2 v2
 	 */
+	starpu_cublas_set_stream();
 	cublasscal(n, p1, v1, 1);
 	cublasaxpy(n, p2, v2, 1, v1, 1);
 }
@@ -584,6 +589,7 @@ static void axpy_kernel_cuda(void *descr[], void *cl_arg)
  
 	/* Compute v1 = v1 + p1 * v2.
 	 */
+	starpu_cublas_set_stream();
 	cublasaxpy(n, p1, v2, 1, v1, 1);
 }
 #endif

+ 6 - 0
examples/cholesky/cholesky_kernels.c

@@ -80,6 +80,8 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STAR
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_HAVE_MAGMA
 		cublasSetKernelStream(starpu_cuda_get_local_stream());
+#else
+		starpu_cublas_set_stream();
 #endif
 		cublasSgemm('n', 't', dy, dx, dz, 
 				-1.0f, left, ld21, right, ld12, 
@@ -129,6 +131,8 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, STARPU_A
 		case 1:
 #ifdef STARPU_HAVE_MAGMA
 			cublasSetKernelStream(starpu_cuda_get_local_stream());
+#else
+			starpu_cublas_set_stream();
 #endif
 			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
 			break;
@@ -205,6 +209,8 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 #if (MAGMA_VERSION_MAJOR > 1) || (MAGMA_VERSION_MAJOR == 1 && MAGMA_VERSION_MINOR >= 4)
 			cublasSetKernelStream(starpu_cuda_get_local_stream());
 			magmablasSetKernelStream(starpu_cuda_get_local_stream());
+#else
+			starpu_cublas_set_stream();
 #endif
 			ret = magma_spotrf_gpu(MagmaLower, nx, sub11, ld, &info);
 			if (ret != MAGMA_SUCCESS)

+ 4 - 0
examples/heat/dw_factolu_kernels.c

@@ -134,6 +134,7 @@ static inline void dw_common_cpu_codelet_update_u22(void *descr[], int s, STARPU
 
 #ifdef STARPU_USE_CUDA
 		case 1:
+			starpu_cublas_set_stream();
 			cublasSgemm('n', 'n', dx, dy, dz, -1.0f, left, ld21,
 					right, ld12, 1.0f, center, ld22);
 			status = cublasGetError();
@@ -197,6 +198,7 @@ static inline void dw_common_codelet_update_u12(void *descr[], int s, STARPU_ATT
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
+			starpu_cublas_set_stream();
 			cublasStrsm('L', 'L', 'N', 'N', ny12, nx12,
 					1.0f, sub11, ld11, sub12, ld12);
 			status = cublasGetError();
@@ -258,6 +260,7 @@ static inline void dw_common_codelet_update_u21(void *descr[], int s, STARPU_ATT
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
+			starpu_cublas_set_stream();
 			cublasStrsm('R', 'U', 'N', 'U', ny21, nx21, 1.0f, sub11, ld11, sub21, ld21);
 			status = cublasGetError();
 			if (status != CUBLAS_STATUS_SUCCESS)
@@ -338,6 +341,7 @@ static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATT
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
+			starpu_cublas_set_stream();
 			for (z = 0; z < nx; z++)
 			{
 				float pivot;

+ 6 - 0
examples/heat/dw_sparse_cg_kernels.c

@@ -146,6 +146,7 @@ void cublas_codelet_func_3(void *descr[], void *arg)
 	vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
+	starpu_cublas_set_stream();
 	dot = cublasSdot (size, vec, 1, vec, 1);
 
 	pb->delta_new = dot;
@@ -238,6 +239,7 @@ void cublas_codelet_func_5(void *descr[], void *arg)
 	STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
+	starpu_cublas_set_stream();
 	dot = cublasSdot (size, vecd, 1, vecq, 1);
 
 	pb->alpha = pb->delta_new / dot;
@@ -281,6 +283,7 @@ void cublas_codelet_func_6(void *descr[], void *arg)
 
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
+	starpu_cublas_set_stream();
 	cublasSaxpy (size, pb->alpha, vecd, 1, vecx, 1);
 }
 #endif
@@ -320,6 +323,7 @@ void cublas_codelet_func_7(void *descr[], void *arg)
 
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
+	starpu_cublas_set_stream();
 	cublasSaxpy (size, -pb->alpha, vecq, 1, vecr, 1);
 }
 #endif
@@ -363,6 +367,7 @@ void cublas_codelet_func_8(void *descr[], void *arg)
 	vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
+	starpu_cublas_set_stream();
 	dot = cublasSdot (size, vecr, 1, vecr, 1);
 
 	pb->delta_old = pb->delta_new;
@@ -411,6 +416,7 @@ void cublas_codelet_func_9(void *descr[], void *arg)
 
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
+	starpu_cublas_set_stream();
 	/* d = beta d */
 	cublasSscal(size, pb->beta, vecd, 1);
 

+ 6 - 0
examples/lu/xlu_kernels.c

@@ -65,6 +65,7 @@ static inline void STARPU_LU(common_u22)(void *descr[],
 #ifdef STARPU_USE_CUDA
 		case 1:
 		{
+			starpu_cublas_set_stream();
 			CUBLAS_GEMM('n', 'n', dx, dy, dz,
 				*(CUBLAS_TYPE*)&m1, (CUBLAS_TYPE *)right, ld21, (CUBLAS_TYPE *)left, ld12,
 				*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE *)center, ld22);
@@ -185,6 +186,7 @@ static inline void STARPU_LU(common_u12)(void *descr[],
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
+			starpu_cublas_set_stream();
 			CUBLAS_TRSM('L', 'L', 'N', 'N', ny12, nx12,
 					*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub12, ld12);
 
@@ -271,6 +273,7 @@ static inline void STARPU_LU(common_u21)(void *descr[],
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
+			starpu_cublas_set_stream();
 			CUBLAS_TRSM('R', 'U', 'N', 'U', ny21, nx21,
 					*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub21, ld21);
 
@@ -366,6 +369,7 @@ static inline void STARPU_LU(common_u11)(void *descr[],
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
+			starpu_cublas_set_stream();
 			for (z = 0; z < nx; z++)
 			{
 				TYPE pivot;
@@ -496,6 +500,7 @@ static inline void STARPU_LU(common_u11_pivot)(void *descr[],
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
+			starpu_cublas_set_stream();
 			for (z = 0; z < nx; z++)
 			{
 				TYPE pivot;
@@ -614,6 +619,7 @@ static inline void STARPU_LU(common_pivot)(void *descr[],
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
+			starpu_cublas_set_stream();
 			for (row = 0; row < nx; row++)
 			{
 				unsigned rowpiv = ipiv[row+first] - first;

+ 1 - 0
examples/mult/xgemm.c

@@ -161,6 +161,7 @@ static void cublas_mult(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
 	unsigned ldB = STARPU_MATRIX_GET_LD(descr[1]);
 	unsigned ldC = STARPU_MATRIX_GET_LD(descr[2]);
 
+	starpu_cublas_set_stream();
 	CUBLAS_GEMM('n', 'n', nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
 				     (TYPE)0.0, subC, ldC);
 }

+ 2 - 0
examples/pipeline/pipeline.c

@@ -101,6 +101,7 @@ void pipeline_cublas_axpy(void *descr[], void *arg)
 	float *y = (float *) STARPU_VECTOR_GET_PTR(descr[1]);
 	int n = STARPU_VECTOR_GET_NX(descr[0]);
 
+	starpu_cublas_set_stream();
 	cublasSaxpy(n, 1., x, 1, y, 1);
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
@@ -143,6 +144,7 @@ void pipeline_cublas_sum(void *descr[], void *arg)
 	int n = STARPU_VECTOR_GET_NX(descr[0]);
 	float y;
 
+	starpu_cublas_set_stream();
 	y = cublasSasum(n, x, 1);
 
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());

+ 2 - 0
examples/reductions/dot_product.c

@@ -256,6 +256,8 @@ void dot_cuda_func(void *descr[], void *cl_arg)
 	cudaMemcpyAsync(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
+	if (cublas_version >= 7050)
+		starpu_cublas_set_stream();
 	local_dot = (DOT_TYPE)cublasSdot(n, local_x, 1, local_y, 1);
 
 	/* FPRINTF(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot); */

+ 1 - 0
examples/spmv/dw_block_spmv_kernels.c

@@ -43,6 +43,7 @@ static inline void common_block_spmv(void *descr[], int s, STARPU_ATTRIBUTE_UNUS
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
+			starpu_cublas_set_stream();
 			cublasSgemv ('t', dx, dy, 1.0f, block, ld, in, 1, 1.0f, out, 1);
 			break;
 #endif

+ 1 - 0
include/starpu_cublas.h

@@ -24,6 +24,7 @@ extern "C"
 #endif
 
 void starpu_cublas_init(void);
+void starpu_cublas_set_stream(void);
 void starpu_cublas_shutdown(void);
 
 #ifdef __cplusplus

+ 10 - 0
src/drivers/cuda/starpu_cublas.c

@@ -73,3 +73,13 @@ void starpu_cublas_shutdown(void)
 	starpu_execute_on_each_worker(shutdown_cublas_func, NULL, STARPU_CUDA);
 #endif
 }
+
+void starpu_cublas_set_stream(void)
+{
+#ifdef STARPU_USE_CUDA
+	if (
+		(!_starpu_get_machine_config()->topology.cuda_th_per_stream &&
+		 _starpu_get_machine_config()->topology.nworkerpercuda > 1))
+		cublasSetKernelStream(starpu_cuda_get_local_stream());
+#endif
+}

+ 1 - 0
tests/microbenchs/matrix_as_vector.c

@@ -55,6 +55,7 @@ void vector_cuda_func(void *descr[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
 	float *matrix = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	int nx = STARPU_VECTOR_GET_NX(descr[0]);
 
+	starpu_cublas_set_stream();
 	float sum = cublasSasum(nx, matrix, 1);
 	sum /= nx;