Samuel Thibault 8 years ago
parent
commit
441644b632

+ 1 - 0
ChangeLog

@@ -60,6 +60,7 @@ New features:
   * Add STARPU_PERF_MODEL_HOMOGENEOUS_CUDA/OPENCL/MIC/SCC to share performance
     models between devices, making calibration much faster.
   * Add modular-heft-prio scheduler.
+  * Add starpu_cublas_get_local_handle helper.
 
 Changes:
   * Fix performance regression of lws for small tasks.

+ 1 - 0
Makefile.am

@@ -98,6 +98,7 @@ versinclude_HEADERS = 				\
 	include/starpu_rand.h			\
 	include/starpu_disk.h			\
 	include/starpu_cublas.h			\
+	include/starpu_cublas_v2.h		\
 	include/starpu_driver.h			\
 	include/starpu_stdlib.h			\
 	include/starpu_thread.h			\

+ 3 - 2
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -62,10 +62,11 @@ Unfortunately, some CUDA libraries do not have stream variants of
 kernels. That will lower the potential for overlapping.
 
 Calling starpu_cublas_init() makes StarPU already do appropriate calls for the
-CUBLAS library. Some libraries like Magma may however change the current stream,
+CUBLAS library. Some libraries, such as Magma, may however change the current CUBLAS v1 stream;
 one then has to call <c>cublasSetKernelStream(starpu_cuda_get_local_stream())</c> at
 the beginning of the codelet to make sure that CUBLAS is really using the proper
-stream.
+stream. When using CUBLAS v2, starpu_cublas_get_local_handle() can be called to get a CUBLAS
+handle that is properly configured for queueing kernels on the worker's local stream.
 
 If the kernel can be made to only use this local stream or other self-allocated
 streams, i.e. the whole kernel submission can be made asynchronous, then
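
For illustration, here is a minimal sketch of a CUDA codelet written against the new helper. The kernel name and scaling factor are hypothetical; the pattern is the one the examples below are converted to:

#include <starpu.h>
#include <starpu_cublas_v2.h>

/* Scale a StarPU vector in place with CUBLAS v2. The handle returned by
 * starpu_cublas_get_local_handle() is already bound to the worker's local
 * stream, so no explicit stream setup is needed. */
void vector_scal_cuda(void *descr[], void *cl_arg)
{
	float *v = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
	float factor = 2.0f; /* hypothetical scaling factor */

	cublasStatus_t status = cublasSscal(starpu_cublas_get_local_handle(), n, &factor, v, 1);
	if (status != CUBLAS_STATUS_SUCCESS)
		STARPU_CUBLAS_REPORT_ERROR(status);
	/* No cudaStreamSynchronize() here: when the codelet sets
	 * STARPU_CUDA_ASYNC, StarPU waits on the stream itself. */
}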

+ 9 - 3
doc/doxygen/chapters/api/cuda_extensions.doxy

@@ -69,10 +69,16 @@ initialized on every device.
 
 \fn void starpu_cublas_set_stream(void)
 \ingroup API_CUDA_Extensions
-This function sets the proper cublas stream. This must be called from the CUDA
-codelet before calling cublas kernels, so that they are queued on the proper
+This function sets the proper CUBLAS v1 stream. This must be called from the CUDA
+codelet before calling CUBLAS v1 kernels, so that they are queued on the proper
 CUDA stream. When using one thread per CUDA worker, this function does not
-do anything since the cublas stream does not change, and is set once by
+do anything since the CUBLAS stream does not change, and is set once by
+starpu_cublas_init().
+
+\fn cublasHandle_t starpu_cublas_get_local_handle(void)
+\ingroup API_CUDA_Extensions
+This function returns the CUBLAS v2 handle to be used to queue CUBLAS v2
+kernels. It is properly initialized and configured for multistream execution by
 starpu_cublas_init().
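
Concretely, the guarantee can be pictured with the following sketch; the check is hypothetical and not part of the API:

#include <assert.h>
#include <starpu.h>
#include <starpu_cublas_v2.h>

/* Inside a CUDA codelet, the local CUBLAS v2 handle has been attached to the
 * worker's local CUDA stream by starpu_cublas_init(). */
static void check_handle_stream(void)
{
	cudaStream_t stream;
	cublasGetStream(starpu_cublas_get_local_handle(), &stream);
	assert(stream == starpu_cuda_get_local_stream());
}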
 
 \fn void starpu_cublas_shutdown(void)

+ 5 - 2
examples/audio/starpu_audio_processing.c

@@ -32,6 +32,7 @@
 #include <fftw3.h>
 #ifdef STARPU_USE_CUDA
 #include <cufft.h>
+#include <starpu_cublas_v2.h>
 #endif
 
 /* #define SAVE_RAW	1 */
@@ -198,7 +199,6 @@ static void band_filter_kernel_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *
 
 	localout = plans[workerid].localout;
 
-	starpu_cublas_set_stream();
 	/* FFT */
 	cures = cufftExecR2C(plans[workerid].plan, localA, localout);
 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
@@ -216,7 +216,10 @@ static void band_filter_kernel_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *
 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
 
 	/* FFTW does not normalize its output ! */
-	cublasSscal (nsamples, 1.0f/nsamples, localA, 1);
+	float scal = 1.0f/nsamples;
+	cublasStatus_t status = cublasSscal (starpu_cublas_get_local_handle(), nsamples, &scal, localA, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 

+ 4 - 3
examples/axpy/axpy.c

@@ -30,7 +30,7 @@
 #include <common/blas.h>
 
 #ifdef STARPU_USE_CUDA
-#include <cublas.h>
+#include <starpu_cublas_v2.h>
 #endif
 
 #include "axpy.h"
@@ -74,8 +74,9 @@ void axpy_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
 	TYPE *block_x = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *block_y = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 
-	starpu_cublas_set_stream();
-	CUBLASAXPY((int)n, alpha, block_x, 1, block_y, 1);
+	cublasStatus_t status = CUBLASAXPY(starpu_cublas_get_local_handle(), (int)n, &alpha, block_x, 1, block_y, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 

+ 0 - 1
examples/cg/cg.c

@@ -21,7 +21,6 @@
 
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
-#include <cublas.h>
 #endif
 
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)

+ 51 - 18
examples/cg/cg_kernels.c

@@ -22,6 +22,12 @@
 #include <math.h>
 #include <limits.h>
 
+#ifdef STARPU_USE_CUDA
+#include <starpu_cublas_v2.h>
+static const TYPE p1 = 1.0;
+static const TYPE m1 = -1.0;
+#endif
+
 #if 0
 static void print_vector_from_descr(unsigned nx, TYPE *v)
 {
@@ -81,8 +87,9 @@ static void accumulate_variable_cuda(void *descr[], void *cl_arg)
 	TYPE *v_dst = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	TYPE *v_src = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
  
-	starpu_cublas_set_stream();
-	cublasaxpy(1, (TYPE)1.0, v_src, 1, v_dst, 1);
+	cublasStatus_t status = cublasaxpy(starpu_cublas_get_local_handle(), 1, &p1, v_src, 1, v_dst, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 
@@ -120,9 +127,10 @@ static void accumulate_vector_cuda(void *descr[], void *cl_arg)
 	TYPE *v_dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v_src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
- 
-	starpu_cublas_set_stream();
-	cublasaxpy(n, (TYPE)1.0, v_src, 1, v_dst, 1);
+
+	cublasStatus_t status = cublasaxpy(starpu_cublas_get_local_handle(), n, &p1, v_src, 1, v_dst, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 
@@ -249,10 +257,26 @@ static void dot_kernel_cuda(void *descr[], void *cl_arg)
 
 	unsigned n = STARPU_VECTOR_GET_NX(descr[1]);
 
-	/* Contrary to cublasSdot, this function puts its result directly in
-	 * device memory, so that we don't have to transfer that value back and
-	 * forth. */
-	dot_host(v1, v2, n, dot);
+	int version;
+	cublasGetVersion(starpu_cublas_get_local_handle(), &version);
+
+	/* FIXME: check in Nvidia bug #1882017 when this gets fixed */
+	if (version < 99999)
+	{
+		/* This function puts its result directly in device memory, so
+		 * that we don't have to transfer that value back and forth. */
+		dot_host(v1, v2, n, dot);
+	}
+	else
+	{
+		/* Should be able to put the result directly in device memory, but
+		 * that does not work yet, see Nvidia bug #1882017 */
+		cublasStatus_t status = cublasdot(starpu_cublas_get_local_handle(),
+			n, v1, 1, v2, 1, dot);
+		if (status != CUBLAS_STATUS_SUCCESS)
+			STARPU_CUBLAS_REPORT_ERROR(status);
+		cudaStreamSynchronize(starpu_cuda_get_local_stream());
+	}
 }
 #endif
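
Once Nvidia bug #1882017 is fixed, the else branch above could also keep the result in device memory, like dot_host() does, by switching the handle to device pointer mode. A sketch, using the float variant for concreteness:

/* Have cublasSdot() write its result straight into device memory, removing
 * the need for the final stream synchronization. dot is a device pointer. */
cublasHandle_t handle = starpu_cublas_get_local_handle();
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
cublasStatus_t status = cublasSdot(handle, n, v1, 1, v2, 1, dot);
if (status != CUBLAS_STATUS_SUCCESS)
	STARPU_CUBLAS_REPORT_ERROR(status);
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST); /* restore the default */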
 
@@ -337,8 +361,9 @@ static void scal_kernel_cuda(void *descr[], void *cl_arg)
  
 	/* v1 = p1 v1 */
 	TYPE alpha = p1;
-	starpu_cublas_set_stream();
-	cublasscal(n, alpha, v1, 1);
+	cublasStatus_t status = cublasscal(starpu_cublas_get_local_handle(), n, &alpha, v1, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 
@@ -392,8 +417,10 @@ static void gemv_kernel_cuda(void *descr[], void *cl_arg)
 	starpu_codelet_unpack_args(cl_arg, &beta, &alpha);
 
 	/* Compute v1 = alpha M v2 + beta v1 */
-	starpu_cublas_set_stream();
-	cublasgemv('N', nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
+	cublasStatus_t status = cublasgemv(starpu_cublas_get_local_handle(),
+			CUBLAS_OP_N, nx, ny, &alpha, M, ld, v2, 1, &beta, v1, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 
@@ -508,9 +535,13 @@ static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
 	 *	v1 = p1 v1
 	 *	v1 = v1 + p2 v2
 	 */
-	starpu_cublas_set_stream();
-	cublasscal(n, p1, v1, 1);
-	cublasaxpy(n, p2, v2, 1, v1, 1);
+	cublasStatus_t status;
+	status = cublasscal(starpu_cublas_get_local_handle(), n, &p1, v1, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
+	status = cublasaxpy(starpu_cublas_get_local_handle(), n, &p2, v2, 1, v1, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 
@@ -589,8 +620,10 @@ static void axpy_kernel_cuda(void *descr[], void *cl_arg)
  
 	/* Compute v1 = v1 + p1 * v2.
 	 */
-	starpu_cublas_set_stream();
-	cublasaxpy(n, p1, v2, 1, v1, 1);
+	cublasStatus_t status = cublasaxpy(starpu_cublas_get_local_handle(),
+			n, &p1, v2, 1, v1, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 

+ 0 - 1
examples/cholesky/cholesky.h

@@ -24,7 +24,6 @@
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
-#include <cublas.h>
 #endif
 
 #include <common/blas.h>

+ 42 - 24
examples/cholesky/cholesky_kernels.c

@@ -25,15 +25,24 @@
 #include <starpu.h>
 #include "cholesky.h"
 #include "../common/blas.h"
-#if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_MAGMA)
+#if defined(STARPU_USE_CUDA)
+#include <cublas.h>
+#include <starpu_cublas_v2.h>
+#if defined(STARPU_HAVE_MAGMA)
 #include "magma.h"
 #include "magma_lapack.h"
 #endif
+#endif
 
 /*
  *   U22 
  */
 
+#if defined(STARPU_USE_CUDA)
+static const float p1 =  1.0;
+static const float m1 = -1.0;
+#endif
+
 static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
 {
 	/* printf("22\n"); */
@@ -78,14 +87,12 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STAR
 	{
 		/* CUDA kernel */
 #ifdef STARPU_USE_CUDA
-#ifdef STARPU_HAVE_MAGMA
-		cublasSetKernelStream(starpu_cuda_get_local_stream());
-#else
-		starpu_cublas_set_stream();
-#endif
-		cublasSgemm('n', 't', dy, dx, dz, 
-				-1.0f, left, ld21, right, ld12, 
-				 1.0f, center, ld22);
+		cublasStatus_t status = cublasSgemm(starpu_cublas_get_local_handle(),
+				CUBLAS_OP_N, CUBLAS_OP_T, dy, dx, dz, 
+				&m1, left, ld21, right, ld12, 
+				&p1, center, ld22);
+		if (status != CUBLAS_STATUS_SUCCESS)
+			STARPU_CUBLAS_REPORT_ERROR(status);
 #endif
 
 	}
@@ -122,6 +129,10 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, STARPU_A
 	unsigned nx21 = STARPU_MATRIX_GET_NY(descr[1]);
 	unsigned ny21 = STARPU_MATRIX_GET_NX(descr[1]);
 
+#ifdef STARPU_USE_CUDA
+	cublasStatus_t status;
+#endif
+
 	switch (s)
 	{
 		case 0:
@@ -129,12 +140,11 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, STARPU_A
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-#ifdef STARPU_HAVE_MAGMA
-			cublasSetKernelStream(starpu_cuda_get_local_stream());
-#else
-			starpu_cublas_set_stream();
-#endif
-			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			status = cublasStrsm(starpu_cublas_get_local_handle(),
+					CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT,
+					nx21, ny21, &p1, sub11, ld11, sub21, ld21);
+			if (status != CUBLAS_STATUS_SUCCESS)
+				STARPU_CUBLAS_REPORT_ERROR(status);
 			break;
 #endif
 		default:
@@ -206,9 +216,10 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 			{
 			int ret;
 			int info;
+			cudaStream_t stream = starpu_cuda_get_local_stream();
 #if (MAGMA_VERSION_MAJOR > 1) || (MAGMA_VERSION_MAJOR == 1 && MAGMA_VERSION_MINOR >= 4)
-			cublasSetKernelStream(starpu_cuda_get_local_stream());
-			magmablasSetKernelStream(starpu_cuda_get_local_stream());
+			cublasSetKernelStream(stream);
+			magmablasSetKernelStream(stream);
 #else
 			starpu_cublas_set_stream();
 #endif
@@ -219,7 +230,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 				STARPU_ABORT();
 			}
 #if (MAGMA_VERSION_MAJOR > 1) || (MAGMA_VERSION_MAJOR == 1 && MAGMA_VERSION_MINOR >= 4)
-			cudaError_t cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaError_t cures = cudaStreamSynchronize(stream);
 #else
 			cudaError_t cures = cudaThreadSynchronize();
 #endif
@@ -229,29 +240,36 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 			{
 
 			float *lambda11;
+			cublasStatus_t status;
+			cudaStream_t stream = starpu_cuda_get_local_stream();
+			cublasHandle_t handle = starpu_cublas_get_local_handle();
 			cudaHostAlloc((void **)&lambda11, sizeof(float), 0);
 
 			for (z = 0; z < nx; z++)
 			{
 				
-				cudaMemcpyAsync(lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
-				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+				cudaMemcpyAsync(lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, stream);
+				cudaStreamSynchronize(stream);
 
 				STARPU_ASSERT(*lambda11 != 0.0f);
 				
 				*lambda11 = sqrt(*lambda11);
 
 /*				cublasSetVector(1, sizeof(float), lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float)); */
-				cudaMemcpyAsync(&sub11[z+z*ld], lambda11, sizeof(float), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+				cudaMemcpyAsync(&sub11[z+z*ld], lambda11, sizeof(float), cudaMemcpyHostToDevice, stream);
+				float scal = 1.0f/(*lambda11);
 
-				cublasSscal(nx - z - 1, 1.0f/(*lambda11), &sub11[(z+1)+z*ld], 1);
+				status = cublasSscal(handle,
+						nx - z - 1, &scal, &sub11[(z+1)+z*ld], 1);
+				if (status != CUBLAS_STATUS_SUCCESS)
+					STARPU_CUBLAS_REPORT_ERROR(status);
 
-				cublasSsyr('U', nx - z - 1, -1.0f,
+				status = cublasSsyr(handle,
+							CUBLAS_FILL_MODE_UPPER,
+							nx - z - 1, &m1,
 							&sub11[(z+1)+z*ld], 1,
 							&sub11[(z+1)+(z+1)*ld], ld);
+				if (status != CUBLAS_STATUS_SUCCESS)
+					STARPU_CUBLAS_REPORT_ERROR(status);
 			}
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaStreamSynchronize(stream);
 			cudaFreeHost(lambda11);
 			}
 #endif

+ 3 - 3
examples/heat/dw_factolu.c

@@ -72,8 +72,8 @@ static struct starpu_codelet cl12 =
 	.cpu_funcs_name = {"dw_cpu_codelet_update_u12"},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u12},
-	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_RW},
 	.model = &model_12
@@ -85,8 +85,8 @@ static struct starpu_codelet cl21 =
 	.cpu_funcs_name = {"dw_cpu_codelet_update_u21"},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u21},
-	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_RW},
 	.model = &model_21
@@ -98,8 +98,8 @@ static struct starpu_codelet cl22 =
 	.cpu_funcs_name = {"dw_cpu_codelet_update_u22"},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u22},
-	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 3,
 	.modes = {STARPU_R, STARPU_R, STARPU_RW},
 	.model = &model_22

+ 0 - 1
examples/heat/dw_factolu.h

@@ -25,7 +25,6 @@
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
-#include <cublas.h>
 #endif
 
 #include "../common/blas.h"

+ 3 - 0
examples/heat/dw_factolu_grain.c

@@ -99,6 +99,7 @@ static struct starpu_codelet cl12 =
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u12},
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 2,
 	.model = &model_12
 };
@@ -144,6 +145,7 @@ static struct starpu_codelet cl21 =
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u21},
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 2,
 	.model = &model_21
 };
@@ -186,6 +188,7 @@ static struct starpu_codelet cl22 =
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u22},
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 3,
 	.model = &model_22
 };

+ 34 - 17
examples/heat/dw_factolu_kernels.c

@@ -20,6 +20,13 @@
  */
 #include "dw_factolu.h"
 
+#ifdef STARPU_USE_CUDA
+#include <cublas.h>
+#include <starpu_cublas_v2.h>
+static const float p1 =  1.0;
+static const float m1 = -1.0;
+#endif
+
 unsigned count_11_per_worker[STARPU_NMAXWORKERS] = {0};
 unsigned count_12_per_worker[STARPU_NMAXWORKERS] = {0};
 unsigned count_21_per_worker[STARPU_NMAXWORKERS] = {0};
@@ -134,10 +141,10 @@ static inline void dw_common_cpu_codelet_update_u22(void *descr[], int s, STARPU
 
 #ifdef STARPU_USE_CUDA
 		case 1:
-			starpu_cublas_set_stream();
-			cublasSgemm('n', 'n', dx, dy, dz, -1.0f, left, ld21,
-					right, ld12, 1.0f, center, ld22);
-			status = cublasGetError();
+			status = cublasSgemm(starpu_cublas_get_local_handle(),
+					CUBLAS_OP_N, CUBLAS_OP_N,
+					dx, dy, dz, &m1, left, ld21,
+					right, ld12, &p1, center, ld22);
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
@@ -198,10 +205,10 @@ static inline void dw_common_codelet_update_u12(void *descr[], int s, STARPU_ATT
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-			starpu_cublas_set_stream();
-			cublasStrsm('L', 'L', 'N', 'N', ny12, nx12,
-					1.0f, sub11, ld11, sub12, ld12);
-			status = cublasGetError();
+			status = cublasStrsm(starpu_cublas_get_local_handle(),
+					CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
+					ny12, nx12,
+					&p1, sub11, ld11, sub12, ld12);
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
@@ -260,9 +267,9 @@ static inline void dw_common_codelet_update_u21(void *descr[], int s, STARPU_ATT
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-			starpu_cublas_set_stream();
-			cublasStrsm('R', 'U', 'N', 'U', ny21, nx21, 1.0f, sub11, ld11, sub21, ld21);
-			status = cublasGetError();
+			status = cublasStrsm(starpu_cublas_get_local_handle(),
+					CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT,
+					ny21, nx21, &p1, sub11, ld11, sub21, ld21);
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
@@ -322,6 +329,12 @@ static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATT
 
 	unsigned long z;
 
+#ifdef STARPU_USE_CUDA
+	cudaStream_t stream;
+	cublasHandle_t handle;
+	cublasStatus_t status;
+#endif
+
 	switch (s)
 	{
 		case 0:
@@ -341,24 +354,28 @@ static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATT
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-			starpu_cublas_set_stream();
+			stream = starpu_cuda_get_local_stream();
+			handle = starpu_cublas_get_local_handle();
 			for (z = 0; z < nx; z++)
 			{
 				float pivot;
-				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
-				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, stream);
+				cudaStreamSynchronize(stream);
 
 				STARPU_ASSERT(pivot != 0.0f);
+				float scal = 1.0f/pivot;
 
-				cublasSscal(nx - z - 1, 1.0f/pivot, &sub11[z+(z+1)*ld], ld);
+				status = cublasSscal(handle,
+						nx - z - 1, &scal, &sub11[z+(z+1)*ld], ld);
+				if (status != CUBLAS_STATUS_SUCCESS)
+					STARPU_CUBLAS_REPORT_ERROR(status);
 
-				cublasSger(nx - z - 1, nx - z - 1, -1.0f,
+				status = cublasSger(handle,
+						nx - z - 1, nx - z - 1, &m1,
 								&sub11[z+(z+1)*ld], ld,
 								&sub11[(z+1)+z*ld], 1,
 								&sub11[(z+1) + (z+1)*ld],ld);
+				if (status != CUBLAS_STATUS_SUCCESS)
+					STARPU_CUBLAS_REPORT_ERROR(status);
 			}
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaStreamSynchronize(stream);
 
 			break;
 #endif

+ 3 - 0
examples/heat/dw_sparse_cg.c

@@ -241,6 +241,7 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 	struct starpu_task *task6 = create_task(maskiter | 6UL);
 #ifdef STARPU_USE_CUDA
 	task6->cl->cuda_funcs[0] = cublas_codelet_func_6;
+	task6->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
 #endif
 	task6->cl->cpu_funcs[0] = cpu_codelet_func_6;
 	task6->cl->cpu_funcs_name[0] = "cpu_codelet_func_6";
@@ -259,6 +260,7 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 	struct starpu_task *task7 = create_task(maskiter | 7UL);
 #ifdef STARPU_USE_CUDA
 	task7->cl->cuda_funcs[0] = cublas_codelet_func_7;
+	task7->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
 #endif
 	task7->cl->cpu_funcs[0] = cpu_codelet_func_7;
 	task7->cl->cpu_funcs_name[0] = "cpu_codelet_func_7";
@@ -292,6 +294,7 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 	struct starpu_task *task9 = create_task(maskiter | 9UL);
 #ifdef STARPU_USE_CUDA
 	task9->cl->cuda_funcs[0] = cublas_codelet_func_9;
+	task9->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
 #endif
 	task9->cl->cpu_funcs[0] = cpu_codelet_func_9;
 	task9->cl->cpu_funcs_name[0] = "cpu_codelet_func_9";

+ 0 - 4
examples/heat/dw_sparse_cg.h

@@ -29,10 +29,6 @@
 
 #include <starpu.h>
 
-#ifdef STARPU_USE_CUDA
-#include <cublas.h>
-#endif
-
 #include "../common/blas.h"
 
 #define MAXITER	100000

+ 30 - 13
examples/heat/dw_sparse_cg_kernels.c

@@ -17,6 +17,10 @@
 
 #include "dw_sparse_cg.h"
 
+#ifdef STARPU_USE_CUDA
+#include <starpu_cublas_v2.h>
+#endif
+
 /*
  *	Algorithm :
  *		
@@ -146,8 +150,10 @@ void cublas_codelet_func_3(void *descr[], void *arg)
 	vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
-	starpu_cublas_set_stream();
-	dot = cublasSdot (size, vec, 1, vec, 1);
+	cublasStatus_t status = cublasSdot (starpu_cublas_get_local_handle(), size, vec, 1, vec, 1, &dot);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 	pb->delta_new = dot;
 	pb->delta_0 = dot;
@@ -239,8 +245,10 @@ void cublas_codelet_func_5(void *descr[], void *arg)
 	STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
-	starpu_cublas_set_stream();
-	dot = cublasSdot (size, vecd, 1, vecq, 1);
+	cublasStatus_t status = cublasSdot (starpu_cublas_get_local_handle(), size, vecd, 1, vecq, 1, &dot);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 	pb->alpha = pb->delta_new / dot;
 }
@@ -283,8 +291,9 @@ void cublas_codelet_func_6(void *descr[], void *arg)
 
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
-	starpu_cublas_set_stream();
-	cublasSaxpy (size, pb->alpha, vecd, 1, vecx, 1);
+	cublasStatus_t status = cublasSaxpy (starpu_cublas_get_local_handle(), size, &pb->alpha, vecd, 1, vecx, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 
@@ -323,8 +332,11 @@ void cublas_codelet_func_7(void *descr[], void *arg)
 
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
-	starpu_cublas_set_stream();
-	cublasSaxpy (size, -pb->alpha, vecq, 1, vecr, 1);
+	float scal = -pb->alpha;
+
+	cublasStatus_t status = cublasSaxpy (starpu_cublas_get_local_handle(), size, &scal, vecq, 1, vecr, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 
@@ -367,8 +379,8 @@ void cublas_codelet_func_8(void *descr[], void *arg)
 	vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
-	starpu_cublas_set_stream();
-	dot = cublasSdot (size, vecr, 1, vecr, 1);
+	cublasStatus_t status = cublasSdot (starpu_cublas_get_local_handle(), size, vecr, 1, vecr, 1, &dot);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 	pb->delta_old = pb->delta_new;
 	pb->delta_new = dot;
@@ -416,11 +428,16 @@ void cublas_codelet_func_9(void *descr[], void *arg)
 
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
-	starpu_cublas_set_stream();
 	/* d = beta d */
-	cublasSscal(size, pb->beta, vecd, 1);
+	cublasStatus_t status;
+	status = cublasSscal(starpu_cublas_get_local_handle(), size, &pb->beta, vecd, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 
 	/* d = r + d */
-	cublasSaxpy (size, 1.0f, vecr, 1, vecd, 1);
+	float scal = 1.0f;
+	status = cublasSaxpy (starpu_cublas_get_local_handle(), size, &scal, vecr, 1, vecd, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif

+ 0 - 3
examples/lu/xlu.h

@@ -20,9 +20,6 @@
 
 #include <starpu.h>
 #include <common/blas.h>
-#ifdef STARPU_USE_CUDA
-#include <cublas.h>
-#endif
 
 #define TAG11(k)	((starpu_tag_t)( (1ULL<<60) | (unsigned long long)(k)))
 #define TAG12(k,i)	((starpu_tag_t)(((2ULL<<60) | (((unsigned long long)(k))<<32)	\

+ 86 - 32
examples/lu/xlu_kernels.c

@@ -21,6 +21,11 @@
 #include <math.h>
 #include <complex.h>
 
+#ifdef STARPU_USE_CUDA
+#include <cublas.h>
+#include <starpu_cublas_v2.h>
+#endif
+
 #define str(s) #s
 #define xstr(s)        str(s)
 #define STARPU_LU_STR(name)  xstr(STARPU_LU(name))
@@ -65,12 +70,11 @@ static inline void STARPU_LU(common_u22)(void *descr[],
 #ifdef STARPU_USE_CUDA
 		case 1:
 		{
-			starpu_cublas_set_stream();
-			CUBLAS_GEMM('n', 'n', dx, dy, dz,
-				*(CUBLAS_TYPE*)&m1, (CUBLAS_TYPE *)right, ld21, (CUBLAS_TYPE *)left, ld12,
-				*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE *)center, ld22);
+			status = CUBLAS_GEMM(starpu_cublas_get_local_handle(),
+				CUBLAS_OP_N, CUBLAS_OP_N, dx, dy, dz,
+				(CUBLAS_TYPE *)&m1, (CUBLAS_TYPE *)right, ld21, (CUBLAS_TYPE *)left, ld12,
+				(CUBLAS_TYPE *)&p1, (CUBLAS_TYPE *)center, ld22);
 
-			status = cublasGetError();
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
@@ -186,11 +190,11 @@ static inline void STARPU_LU(common_u12)(void *descr[],
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-			starpu_cublas_set_stream();
-			CUBLAS_TRSM('L', 'L', 'N', 'N', ny12, nx12,
-					*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub12, ld12);
+			status = CUBLAS_TRSM(starpu_cublas_get_local_handle(),
+					CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
+					ny12, nx12,
+					(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub12, ld12);
 
-			status = cublasGetError();
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
@@ -273,11 +277,11 @@ static inline void STARPU_LU(common_u21)(void *descr[],
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-			starpu_cublas_set_stream();
-			CUBLAS_TRSM('R', 'U', 'N', 'U', ny21, nx21,
-					*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub21, ld21);
+			status = CUBLAS_TRSM(starpu_cublas_get_local_handle(),
+					CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT,
+					ny21, nx21,
+					(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub21, ld21);
 
-			status = cublasGetError();
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
@@ -345,6 +349,12 @@ static inline void STARPU_LU(common_u11)(void *descr[],
 
 	unsigned long z;
 
+#ifdef STARPU_USE_CUDA
+	cublasStatus_t status;
+	cublasHandle_t handle;
+	cudaStream_t stream;
+#endif
+
 	switch (s)
 	{
 		case 0:
@@ -369,13 +379,14 @@ static inline void STARPU_LU(common_u11)(void *descr[],
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-			starpu_cublas_set_stream();
+			handle = starpu_cublas_get_local_handle();
+			stream = starpu_cuda_get_local_stream();
 			for (z = 0; z < nx; z++)
 			{
 				TYPE pivot;
 				TYPE inv_pivot;
-				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
-				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, stream);
+				cudaStreamSynchronize(stream);
 
 #ifdef COMPLEX_LU
 				STARPU_ASSERT(fpclassify(creal(pivot)) != FP_ZERO);
@@ -385,15 +396,23 @@ static inline void STARPU_LU(common_u11)(void *descr[],
 #endif
 				
 				inv_pivot = 1.0/pivot;
-				CUBLAS_SCAL(nx - z - 1, *(CUBLAS_TYPE*)&inv_pivot, (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
+				status = CUBLAS_SCAL(handle,
+						nx - z - 1,
+						(CUBLAS_TYPE*)&inv_pivot, (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
+				if (status != CUBLAS_STATUS_SUCCESS)
+					STARPU_CUBLAS_REPORT_ERROR(status);
 				
-				CUBLAS_GER(nx - z - 1, nx - z - 1, *(CUBLAS_TYPE*)&m1,
+				status = CUBLAS_GER(handle,
+						nx - z - 1, nx - z - 1,
+						(CUBLAS_TYPE*)&m1,
 						(CUBLAS_TYPE*)&sub11[(z+1)+z*ld], 1,
 						(CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld,
 						(CUBLAS_TYPE*)&sub11[(z+1) + (z+1)*ld],ld);
+				if (status != CUBLAS_STATUS_SUCCESS)
+					STARPU_CUBLAS_REPORT_ERROR(status);
 			}
 			
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaStreamSynchronize(stream);
 
 			break;
 #endif
@@ -462,6 +481,12 @@ static inline void STARPU_LU(common_u11_pivot)(void *descr[],
 	unsigned *ipiv = piv->piv;
 	unsigned first = piv->first;
 
+#ifdef STARPU_USE_CUDA
+	cublasStatus_t status;
+	cublasHandle_t handle;
+	cudaStream_t stream;
+#endif
+
 	switch (s)
 	{
 		case 0:
@@ -500,44 +525,63 @@ static inline void STARPU_LU(common_u11_pivot)(void *descr[],
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-			starpu_cublas_set_stream();
+			handle = starpu_cublas_get_local_handle();
+			stream = starpu_cuda_get_local_stream();
 			for (z = 0; z < nx; z++)
 			{
 				TYPE pivot;
 				TYPE inv_pivot;
-				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
-				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, stream);
+				cudaStreamSynchronize(stream);
 
 				if (fabs((double)(pivot)) < PIVOT_THRESHHOLD)
 				{
 					/* find the pivot */
-					int piv_ind = CUBLAS_IAMAX(nx - z, (CUBLAS_TYPE*)&sub11[z*(ld+1)], ld) - 1;
+					int piv_ind;
+					status = CUBLAS_IAMAX(handle,
+						nx - z, (CUBLAS_TYPE*)&sub11[z*(ld+1)], ld, &piv_ind);
+					if (status != CUBLAS_STATUS_SUCCESS)
+						STARPU_CUBLAS_REPORT_ERROR(status);
+					piv_ind -= 1;
 	
 					ipiv[z + first] = piv_ind + z + first;
 
 					/* swap if needed */
 					if (piv_ind != 0)
 					{
-						CUBLAS_SWAP(nx, (CUBLAS_TYPE*)&sub11[z*ld], 1, (CUBLAS_TYPE*)&sub11[(z+piv_ind)*ld], 1);
+						status = CUBLAS_SWAP(handle,
+							nx,
+							(CUBLAS_TYPE*)&sub11[z*ld], 1,
+							(CUBLAS_TYPE*)&sub11[(z+piv_ind)*ld], 1);
+						if (status != CUBLAS_STATUS_SUCCESS)
+							STARPU_CUBLAS_REPORT_ERROR(status);
 					}
 
-					cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
-					cudaStreamSynchronize(starpu_cuda_get_local_stream());
+					cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, stream);
+					cudaStreamSynchronize(stream);
 				}
 
 				STARPU_ASSERT(pivot != 0.0);
 				
 				inv_pivot = 1.0/pivot;
-				CUBLAS_SCAL(nx - z - 1, *(CUBLAS_TYPE*)&inv_pivot, (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
+				status = CUBLAS_SCAL(handle,
+						nx - z - 1,
+						(CUBLAS_TYPE*)&inv_pivot,
+						(CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
+				if (status != CUBLAS_STATUS_SUCCESS)
+					STARPU_CUBLAS_REPORT_ERROR(status);
 				
-				CUBLAS_GER(nx - z - 1, nx - z - 1, *(CUBLAS_TYPE*)&m1,
+				status = CUBLAS_GER(handle,
+						nx - z - 1, nx - z - 1,
+						(CUBLAS_TYPE*)&m1,
 						(CUBLAS_TYPE*)&sub11[(z+1)+z*ld], 1,
 						(CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld,
 						(CUBLAS_TYPE*)&sub11[(z+1) + (z+1)*ld],ld);
-				
+				if (status != CUBLAS_STATUS_SUCCESS)
+					STARPU_CUBLAS_REPORT_ERROR(status);
 			}
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaStreamSynchronize(stream);
 
 			break;
 #endif
@@ -605,6 +649,11 @@ static inline void STARPU_LU(common_pivot)(void *descr[],
 	unsigned *ipiv = piv->piv;
 	unsigned first = piv->first;
 
+#ifdef STARPU_USE_CUDA
+	cublasStatus_t status;
+	cublasHandle_t handle;
+#endif
+
 	switch (s)
 	{
 		case 0:
@@ -619,13 +668,18 @@ static inline void STARPU_LU(common_pivot)(void *descr[],
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-			starpu_cublas_set_stream();
+			handle = starpu_cublas_get_local_handle();
 			for (row = 0; row < nx; row++)
 			{
 				unsigned rowpiv = ipiv[row+first] - first;
 				if (rowpiv != row)
 				{
-					CUBLAS_SWAP(nx, (CUBLAS_TYPE*)&matrix[row*ld], 1, (CUBLAS_TYPE*)&matrix[rowpiv*ld], 1);
+					status = CUBLAS_SWAP(handle,
+							nx,
+							(CUBLAS_TYPE*)&matrix[row*ld], 1,
+							(CUBLAS_TYPE*)&matrix[rowpiv*ld], 1);
+					if (status != CUBLAS_STATUS_SUCCESS)
+						STARPU_CUBLAS_REPORT_ERROR(status);
 				}
 			}
 

+ 11 - 4
examples/mult/xgemm.c

@@ -36,7 +36,10 @@
 
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
-#include <cublas.h>
+#include <starpu_cublas_v2.h>
+static const TYPE p1 = 1.0;
+static const TYPE m1 = -1.0;
+static const TYPE v0 = 0.0;
 #endif
 
 static unsigned niter = 10;
@@ -161,9 +164,13 @@ static void cublas_mult(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
 	unsigned ldB = STARPU_MATRIX_GET_LD(descr[1]);
 	unsigned ldC = STARPU_MATRIX_GET_LD(descr[2]);
 
-	starpu_cublas_set_stream();
-	CUBLAS_GEMM('n', 'n', nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
-				     (TYPE)0.0, subC, ldC);
+	cublasStatus_t status = CUBLAS_GEMM(starpu_cublas_get_local_handle(),
+			CUBLAS_OP_N, CUBLAS_OP_N,
+			nxC, nyC, nyA,
+			&p1, subA, ldA, subB, ldB,
+			&v0, subC, ldC);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 

+ 6 - 7
examples/pipeline/pipeline.c

@@ -35,7 +35,7 @@
 #include <common/blas.h>
 
 #ifdef STARPU_USE_CUDA
-#include <cublas.h>
+#include <starpu_cublas_v2.h>
 #endif
 
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
@@ -100,10 +100,9 @@ void pipeline_cublas_axpy(void *descr[], void *arg)
 	float *x = (float *) STARPU_VECTOR_GET_PTR(descr[0]);
 	float *y = (float *) STARPU_VECTOR_GET_PTR(descr[1]);
 	int n = STARPU_VECTOR_GET_NX(descr[0]);
+	float alpha = 1.;
 
-	starpu_cublas_set_stream();
-	cublasSaxpy(n, 1., x, 1, y, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+	cublasSaxpy(starpu_cublas_get_local_handle(), n, &alpha, x, 1, y, 1);
 }
 #endif
 
@@ -119,6 +118,7 @@ static struct starpu_codelet pipeline_codelet_axpy =
 	.cpu_funcs_name = {"pipeline_cpu_axpy"},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {pipeline_cublas_axpy},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_RW},
@@ -144,10 +144,8 @@ void pipeline_cublas_sum(void *descr[], void *arg)
 	int n = STARPU_VECTOR_GET_NX(descr[0]);
 	float y;
 
-	starpu_cublas_set_stream();
-	y = cublasSasum(n, x, 1);
+	cublasSasum(starpu_cublas_get_local_handle(), n, x, 1, &y);
 
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	FPRINTF(stderr,"CUBLAS finished with %f\n", y);
 }
 #endif
@@ -164,6 +162,7 @@ static struct starpu_codelet pipeline_codelet_sum =
 	.cpu_funcs_name = {"pipeline_cpu_sum"},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {pipeline_cublas_sum},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 1,
 	.modes = {STARPU_R},

+ 13 - 11
examples/reductions/dot_product.c

@@ -29,7 +29,7 @@
 
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
-#include <cublas.h>
+#include <starpu_cublas_v2.h>
 #endif
 
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
@@ -245,7 +245,7 @@ void dot_cpu_func(void *descr[], void *cl_arg)
 void dot_cuda_func(void *descr[], void *cl_arg)
 {
 	DOT_TYPE current_dot;
-	DOT_TYPE local_dot;
+	float local_dot;
 
 	float *local_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	float *local_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
@@ -256,9 +256,10 @@ void dot_cuda_func(void *descr[], void *cl_arg)
 	cudaMemcpyAsync(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
-	if (cublas_version >= 7050)
-		starpu_cublas_set_stream();
-	local_dot = (DOT_TYPE)cublasSdot(n, local_x, 1, local_y, 1);
+	cublasStatus_t status = cublasSdot(starpu_cublas_get_local_handle(), n, local_x, 1, local_y, 1, &local_dot);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 	/* FPRINTF(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot); */
 	current_dot += local_dot;
@@ -358,14 +359,15 @@ int main(int argc, char **argv)
 #endif
 
 #ifdef STARPU_USE_CUDA
-	/* cublasSdot has synchronization issues when using a non-blocking stream (Nvidia bugid 1669886) */
-	cublasGetVersion(&cublas_version);
+	cublasHandle_t handle;
+	cublasCreate(&handle);
+	cublasGetVersion(handle, &cublas_version);
+	cublasDestroy(handle);
 	if (cublas_version >= 7050)
 		starpu_cublas_init();
-	if (starpu_get_env_number_default("STARPU_NWORKER_PER_CUDA", 1) > 1
-	 && starpu_get_env_number_default("STARPU_CUDA_THREAD_PER_WORKER", 0) == 1)
-		/* Disable the sdot cublas kernel, it is bogus with concurrent
-		 * multistream execution (Nvidia bugid 1881192) */
+	else
+		/* Disable the sdot cublas kernel, it is bogus with a
+		 * non-blocking stream (Nvidia bugid 1669886) */
 		dot_codelet.cuda_funcs[0] = NULL;
 #endif
 

+ 0 - 4
examples/sched_ctx/gpu_partition.c

@@ -26,10 +26,6 @@
 
 #include <common/blas.h>
 
-#ifdef STARPU_USE_CUDA
-#include <cublas.h>
-#endif
-
 
 #define N	512*512
 #define NITER   100

+ 2 - 0
examples/spmv/dw_block_spmv.c

@@ -167,6 +167,7 @@ struct starpu_codelet cl =
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cublas_block_spmv},
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 3,
 	.modes = {STARPU_R, STARPU_R, STARPU_RW}
 };
@@ -320,6 +321,7 @@ int main(STARPU_ATTRIBUTE_UNUSED int argc,
 	if (ret == -ENODEV)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	starpu_cublas_init();
 
 	sem_init(&sem, 0, 0U);
 

+ 0 - 4
examples/spmv/dw_block_spmv.h

@@ -28,10 +28,6 @@
 
 #include <starpu.h>
 
-#ifdef STARPU_USE_CUDA
-#include <cublas.h>
-#endif
-
 void cpu_block_spmv(void *descr[], void *_args);
 
 #ifdef STARPU_USE_CUDA

+ 10 - 2
examples/spmv/dw_block_spmv_kernels.c

@@ -24,6 +24,12 @@
  *   U22 
  */
 
+#ifdef STARPU_USE_CUDA
+#include <starpu_cublas_v2.h>
+static const float p1 =  1.0;
+static const float m1 = -1.0;
+#endif
+
 static inline void common_block_spmv(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
 {
 	/* printf("22\n"); */
@@ -43,8 +49,10 @@ static inline void common_block_spmv(void *descr[], int s, STARPU_ATTRIBUTE_UNUS
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-			starpu_cublas_set_stream();
-			cublasSgemv ('t', dx, dy, 1.0f, block, ld, in, 1, 1.0f, out, 1);
+			cublasStatus_t status = cublasSgemv (starpu_cublas_get_local_handle(),
+					CUBLAS_OP_T, dx, dy, &p1, block, ld, in, 1, &p1, out, 1);
+			if (status != CUBLAS_STATUS_SUCCESS)
+				STARPU_CUBLAS_REPORT_ERROR(status);
 			break;
 #endif
 		default:

+ 34 - 0
include/starpu_cublas_v2.h

@@ -0,0 +1,34 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2012  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_CUBLAS_V2_H__
+#define __STARPU_CUBLAS_V2_H__
+
+#include <cublas_v2.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+cublasHandle_t starpu_cublas_get_local_handle(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __STARPU_CUBLAS_V2_H__ */

+ 0 - 1
src/datawizard/copy_driver.h

@@ -29,7 +29,6 @@
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
-#include <cublas.h>
 #endif
 
 #ifdef STARPU_USE_OPENCL

+ 21 - 0
src/drivers/cuda/starpu_cublas.c

@@ -22,8 +22,11 @@
 
 #ifdef STARPU_USE_CUDA
 #include <cublas.h>
+#include <starpu_cublas_v2.h>
 
 static int cublas_initialized[STARPU_NMAXWORKERS];
+static cublasHandle_t cublas_handles[STARPU_NMAXWORKERS];
+static cublasHandle_t main_handle;
 static starpu_pthread_mutex_t mutex;
 
 static unsigned get_idx(void) {
@@ -51,6 +54,9 @@ static void init_cublas_func(void *args STARPU_ATTRIBUTE_UNUSED)
 			STARPU_CUBLAS_REPORT_ERROR(cublasst);
 	}
 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+
+	cublasCreate(&cublas_handles[starpu_worker_get_id_check()]);
+	cublasSetStream(cublas_handles[starpu_worker_get_id_check()], starpu_cuda_get_local_stream());
 }
 
 static void set_cublas_stream_func(void *args STARPU_ATTRIBUTE_UNUSED)
@@ -65,6 +71,8 @@ static void shutdown_cublas_func(void *args STARPU_ATTRIBUTE_UNUSED)
 	if (!--cublas_initialized[idx])
 		cublasShutdown();
 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+
+	cublasDestroy(cublas_handles[starpu_worker_get_id_check()]);
 }
 #endif
 
@@ -73,6 +81,8 @@ void starpu_cublas_init(void)
 #ifdef STARPU_USE_CUDA
 	starpu_execute_on_each_worker(init_cublas_func, NULL, STARPU_CUDA);
 	starpu_execute_on_each_worker(set_cublas_stream_func, NULL, STARPU_CUDA);
+
+	cublasCreate(&main_handle);
 #endif
 }
 
@@ -80,6 +90,8 @@ void starpu_cublas_shutdown(void)
 {
 #ifdef STARPU_USE_CUDA
 	starpu_execute_on_each_worker(shutdown_cublas_func, NULL, STARPU_CUDA);
+
+	cublasDestroy(main_handle);
 #endif
 }
 
@@ -92,3 +104,12 @@ void starpu_cublas_set_stream(void)
 		cublasSetKernelStream(starpu_cuda_get_local_stream());
 #endif
 }
+
+#ifdef STARPU_USE_CUDA
+cublasHandle_t starpu_cublas_get_local_handle(void)
+{
+	int workerid = starpu_worker_get_id();
+	if (workerid >= 0)
+		return cublas_handles[workerid];
+	else
+		return main_handle;
+}
+#endif
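
Putting the pieces together, the application-side lifecycle expected by this change looks roughly as follows; cl stands for any hypothetical codelet whose .cuda_flags contains STARPU_CUDA_ASYNC:

#include <starpu.h>

extern struct starpu_codelet cl; /* hypothetical codelet, .cuda_flags = {STARPU_CUDA_ASYNC} */

int main(void)
{
	if (starpu_init(NULL) != 0)
		return 77;
	starpu_cublas_init();	/* one CUBLAS v2 handle per CUDA worker, bound to its stream */

	/* ... register data handles and submit tasks using cl ... */
	starpu_task_wait_for_all();

	starpu_cublas_shutdown();	/* destroys the per-worker handles and the main handle */
	starpu_shutdown();
	return 0;
}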

+ 9 - 14
tests/microbenchs/matrix_as_vector.c

@@ -18,7 +18,7 @@
 #include "../helper.h"
 
 #ifdef STARPU_USE_CUDA
-#  include <cublas.h>
+#  include <starpu_cublas_v2.h>
 #endif
 
 /*
@@ -55,8 +55,9 @@ void vector_cuda_func(void *descr[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
 	float *matrix = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	int nx = STARPU_VECTOR_GET_NX(descr[0]);
 
-	starpu_cublas_set_stream();
-	float sum = cublasSasum(nx, matrix, 1);
+	float sum;
+	cublasSasum(starpu_cublas_get_local_handle(), nx, matrix, 1, &sum);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	sum /= nx;
 
 	cudaMemcpyAsync(matrix, &sum, sizeof(matrix[0]), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
@@ -87,7 +88,9 @@ void matrix_cuda_func(void *descr[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
 	int nx = STARPU_MATRIX_GET_NX(descr[0]);
 	int ny = STARPU_MATRIX_GET_NY(descr[0]);
 
-	float sum = cublasSasum(nx*ny, matrix, 1);
+	float sum;
+	cublasSasum(starpu_cublas_get_local_handle(), nx*ny, matrix, 1, &sum);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	sum /= nx*ny;
 
 	cudaMemcpyAsync(matrix, &sum, sizeof(matrix[0]), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
@@ -218,12 +221,7 @@ int main(int argc, char **argv)
 	ret = starpu_init(NULL);
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-#ifdef STARPU_USE_CUDA
-	/* cublasSasum has synchronization issues when using a non-blocking stream */
-	cublasGetVersion(&cublas_version);
-	if (cublas_version >= 7050)
-		starpu_cublas_init();
-#endif
+	starpu_cublas_init();
 
 	devices = starpu_cpu_worker_get_count();
 	if (devices)
@@ -246,11 +244,8 @@ int main(int argc, char **argv)
 
 error:
 	if (ret == -ENODEV) ret=STARPU_TEST_SKIPPED;
-#ifdef STARPU_USE_CUDA
-	if (cublas_version >= 7050)
-		starpu_cublas_shutdown();
-#endif
 
+	starpu_cublas_shutdown();
 	starpu_shutdown();
 	STARPU_RETURN(ret);
 }