Samuel Thibault 8 years ago
parent
commit
441644b632

+ 1 - 0
ChangeLog

@@ -60,6 +60,7 @@ New features:
   * Add STARPU_PERF_MODEL_HOMOGENEOUS_CUDA/OPENCL/MIC/SCC to share performance
     models between devices, making calibration much faster.
   * Add modular-heft-prio scheduler.
+  * Add starpu_cublas_get_local_handle helper.
 
 Changes:
   * Fix performance regression of lws for small tasks.

+ 1 - 0
Makefile.am

@@ -98,6 +98,7 @@ versinclude_HEADERS = 				\
 	include/starpu_rand.h			\
 	include/starpu_disk.h			\
 	include/starpu_cublas.h			\
+	include/starpu_cublas_v2.h		\
 	include/starpu_driver.h			\
 	include/starpu_stdlib.h			\
 	include/starpu_thread.h			\

+ 3 - 2
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -62,10 +62,11 @@ Unfortunately, some CUDA libraries do not have stream variants of
 kernels. That will lower the potential for overlapping.
 
 Calling starpu_cublas_init() makes StarPU already do appropriate calls for the
-CUBLAS library. Some libraries like Magma may however change the current stream,
+CUBLAS library. Some libraries, such as Magma, may however change the current CUBLAS v1 stream;
 one then has to call <c>cublasSetKernelStream(starpu_cuda_get_local_stream())</c> at
 the beginning of the codelet to make sure that CUBLAS is really using the proper
-stream.
+stream. When using CUBLAS v2, starpu_cublas_get_local_handle() can be called to get a CUBLAS
+handle that is properly configured for queueing kernels on the worker's local stream.
 
 If the kernel can be made to only use this local stream or other self-allocated
 streams, i.e. the whole kernel submission can be made asynchronous, then
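
For illustration, here is a minimal sketch of a CUDA codelet written against the new helper. The kernel name and scaling factor are hypothetical; the pattern is the one the examples below are converted to:

#include <starpu.h>
#include <starpu_cublas_v2.h>

/* Scale a StarPU vector in place with CUBLAS v2. The handle returned by
 * starpu_cublas_get_local_handle() is already bound to the worker's local
 * stream, so no explicit stream setup is needed. */
void vector_scal_cuda(void *descr[], void *cl_arg)
{
	float *v = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
	float factor = 2.0f; /* hypothetical scaling factor */

	cublasStatus_t status = cublasSscal(starpu_cublas_get_local_handle(), n, &factor, v, 1);
	if (status != CUBLAS_STATUS_SUCCESS)
		STARPU_CUBLAS_REPORT_ERROR(status);
	/* No cudaStreamSynchronize() here: when the codelet sets
	 * STARPU_CUDA_ASYNC, StarPU waits on the stream itself. */
}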

+ 9 - 3
doc/doxygen/chapters/api/cuda_extensions.doxy

@@ -69,10 +69,16 @@ initialized on every device.
 
 \fn void starpu_cublas_set_stream(void)
 \ingroup API_CUDA_Extensions
-This function sets the proper cublas stream. This must be called from the CUDA
-codelet before calling cublas kernels, so that they are queued on the proper
+This function sets the proper CUBLAS v1 stream. This must be called from the CUDA
+codelet before calling CUBLAS v1 kernels, so that they are queued on the proper
 CUDA stream. When using one thread per CUDA worker, this function does not
-do anything since the cublas stream does not change, and is set once by
+do anything since the CUBLAS stream does not change, and is set once by
+starpu_cublas_init().
+
+\fn cublasHandle_t starpu_cublas_get_local_handle(void)
+\ingroup API_CUDA_Extensions
+This function returns the CUBLAS v2 handle to be used to queue CUBLAS v2
+kernels. It is properly initialized and configured for multistream execution by
 starpu_cublas_init().
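
Concretely, the guarantee can be pictured with the following sketch; the check is hypothetical and not part of the API:

#include <assert.h>
#include <starpu.h>
#include <starpu_cublas_v2.h>

/* Inside a CUDA codelet, the local CUBLAS v2 handle has been attached to the
 * worker's local CUDA stream by starpu_cublas_init(). */
static void check_handle_stream(void)
{
	cudaStream_t stream;
	cublasGetStream(starpu_cublas_get_local_handle(), &stream);
	assert(stream == starpu_cuda_get_local_stream());
}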
 
 \fn void starpu_cublas_shutdown(void)

+ 5 - 2
examples/audio/starpu_audio_processing.c

@@ -32,6 +32,7 @@
 #include <fftw3.h>
 #ifdef STARPU_USE_CUDA
 #include <cufft.h>
+#include <starpu_cublas_v2.h>
 #endif
 
 /* #define SAVE_RAW	1 */
@@ -198,7 +199,6 @@ static void band_filter_kernel_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *
 
 	localout = plans[workerid].localout;
 
-	starpu_cublas_set_stream();
 	/* FFT */
 	cures = cufftExecR2C(plans[workerid].plan, localA, localout);
 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
@@ -216,7 +216,10 @@ static void band_filter_kernel_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *
 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
 
 	/* FFTW does not normalize its output ! */
-	cublasSscal (nsamples, 1.0f/nsamples, localA, 1);
+	float scal = 1.0f/nsamples;
+	cublasStatus_t status = cublasSscal (starpu_cublas_get_local_handle(), nsamples, &scal, localA, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 

+ 4 - 3
examples/axpy/axpy.c

@@ -30,7 +30,7 @@
 #include <common/blas.h>
 
 #ifdef STARPU_USE_CUDA
-#include <cublas.h>
+#include <starpu_cublas_v2.h>
 #endif
 
 #include "axpy.h"
@@ -74,8 +74,9 @@ void axpy_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
 	TYPE *block_x = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *block_y = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 
-	starpu_cublas_set_stream();
-	CUBLASAXPY((int)n, alpha, block_x, 1, block_y, 1);
+	cublasStatus_t status = CUBLASAXPY(starpu_cublas_get_local_handle(), (int)n, &alpha, block_x, 1, block_y, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 

+ 0 - 1
examples/cg/cg.c

@@ -21,7 +21,6 @@
 
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
-#include <cublas.h>
 #endif
 
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)

+ 51 - 18
examples/cg/cg_kernels.c

@@ -22,6 +22,12 @@
 #include <math.h>
 #include <limits.h>
 
+#ifdef STARPU_USE_CUDA
+#include <starpu_cublas_v2.h>
+static const TYPE p1 = 1.0;
+static const TYPE m1 = -1.0;
+#endif
+
 #if 0
 static void print_vector_from_descr(unsigned nx, TYPE *v)
 {
@@ -81,8 +87,9 @@ static void accumulate_variable_cuda(void *descr[], void *cl_arg)
 	TYPE *v_dst = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	TYPE *v_src = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
  
-	starpu_cublas_set_stream();
-	cublasaxpy(1, (TYPE)1.0, v_src, 1, v_dst, 1);
+	cublasStatus_t status = cublasaxpy(starpu_cublas_get_local_handle(), 1, &p1, v_src, 1, v_dst, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 
@@ -120,9 +127,10 @@ static void accumulate_vector_cuda(void *descr[], void *cl_arg)
 	TYPE *v_dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v_src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
- 
-	starpu_cublas_set_stream();
-	cublasaxpy(n, (TYPE)1.0, v_src, 1, v_dst, 1);
+
+	cublasStatus_t status = cublasaxpy(starpu_cublas_get_local_handle(), n, &p1, v_src, 1, v_dst, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 
@@ -249,10 +257,26 @@ static void dot_kernel_cuda(void *descr[], void *cl_arg)
 
 	unsigned n = STARPU_VECTOR_GET_NX(descr[1]);
 
-	/* Contrary to cublasSdot, this function puts its result directly in
-	 * device memory, so that we don't have to transfer that value back and
-	 * forth. */
-	dot_host(v1, v2, n, dot);
+	int version;
+	cublasGetVersion(starpu_cublas_get_local_handle(), &version);
+
+	/* FIXME: check in Nvidia bug #1882017 when this gets fixed */
+	if (version < 99999)
+	{
+		/* This function puts its result directly in device memory, so
+		 * that we don't have to transfer that value back and forth. */
+		dot_host(v1, v2, n, dot);
+	}
+	else
+	{
+		/* Should be able to put the result directly in device memory, but
+		 * that does not work yet, see Nvidia bug #1882017 */
+		cublasStatus_t status = cublasdot(starpu_cublas_get_local_handle(),
+			n, v1, 1, v2, 1, dot);
+		if (status != CUBLAS_STATUS_SUCCESS)
+			STARPU_CUBLAS_REPORT_ERROR(status);
+		cudaStreamSynchronize(starpu_cuda_get_local_stream());
+	}
 }
 #endif
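
Once Nvidia bug #1882017 is fixed, the else branch above could also keep the result in device memory, like dot_host() does, by switching the handle to device pointer mode. A sketch, using the float variant for concreteness:

/* Have cublasSdot() write its result straight into device memory, removing
 * the need for the final stream synchronization. dot is a device pointer. */
cublasHandle_t handle = starpu_cublas_get_local_handle();
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
cublasStatus_t status = cublasSdot(handle, n, v1, 1, v2, 1, dot);
if (status != CUBLAS_STATUS_SUCCESS)
	STARPU_CUBLAS_REPORT_ERROR(status);
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST); /* restore the default */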
 
@@ -337,8 +361,9 @@ static void scal_kernel_cuda(void *descr[], void *cl_arg)
  
 	/* v1 = p1 v1 */
 	TYPE alpha = p1;
-	starpu_cublas_set_stream();
-	cublasscal(n, alpha, v1, 1);
+	cublasStatus_t status = cublasscal(starpu_cublas_get_local_handle(), n, &alpha, v1, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 
@@ -392,8 +417,10 @@ static void gemv_kernel_cuda(void *descr[], void *cl_arg)
 	starpu_codelet_unpack_args(cl_arg, &beta, &alpha);
 
 	/* Compute v1 = alpha M v2 + beta v1 */
-	starpu_cublas_set_stream();
-	cublasgemv('N', nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
+	cublasStatus_t status = cublasgemv(starpu_cublas_get_local_handle(),
+			CUBLAS_OP_N, nx, ny, &alpha, M, ld, v2, 1, &beta, v1, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 
@@ -508,9 +535,13 @@ static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
 	 *	v1 = p1 v1
 	 *	v1 = v1 + p2 v2
 	 */
-	starpu_cublas_set_stream();
-	cublasscal(n, p1, v1, 1);
-	cublasaxpy(n, p2, v2, 1, v1, 1);
+	cublasStatus_t status;
+	status = cublasscal(starpu_cublas_get_local_handle(), n, &p1, v1, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
+	status = cublasaxpy(starpu_cublas_get_local_handle(), n, &p2, v2, 1, v1, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 
@@ -589,8 +620,10 @@ static void axpy_kernel_cuda(void *descr[], void *cl_arg)
  
 	/* Compute v1 = v1 + p1 * v2.
 	 */
-	starpu_cublas_set_stream();
-	cublasaxpy(n, p1, v2, 1, v1, 1);
+	cublasStatus_t status = cublasaxpy(starpu_cublas_get_local_handle(),
+			n, &p1, v2, 1, v1, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 

+ 0 - 1
examples/cholesky/cholesky.h

@@ -24,7 +24,6 @@
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
-#include <cublas.h>
 #endif
 
 #include <common/blas.h>

+ 42 - 24
examples/cholesky/cholesky_kernels.c

@@ -25,15 +25,24 @@
 #include <starpu.h>
 #include "cholesky.h"
 #include "../common/blas.h"
-#if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_MAGMA)
+#if defined(STARPU_USE_CUDA)
+#include <cublas.h>
+#include <starpu_cublas_v2.h>
+#if defined(STARPU_HAVE_MAGMA)
 #include "magma.h"
 #include "magma_lapack.h"
 #endif
+#endif
 
 /*
  *   U22 
  */
 
+#if defined(STARPU_USE_CUDA)
+static const float p1 =  1.0;
+static const float m1 = -1.0;
+#endif
+
 static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
 {
 	/* printf("22\n"); */
@@ -78,14 +87,12 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STAR
 	{
 		/* CUDA kernel */
 #ifdef STARPU_USE_CUDA
-#ifdef STARPU_HAVE_MAGMA
-		cublasSetKernelStream(starpu_cuda_get_local_stream());
-#else
-		starpu_cublas_set_stream();
-#endif
-		cublasSgemm('n', 't', dy, dx, dz, 
-				-1.0f, left, ld21, right, ld12, 
-				 1.0f, center, ld22);
+		cublasStatus_t status = cublasSgemm(starpu_cublas_get_local_handle(),
+				CUBLAS_OP_N, CUBLAS_OP_T, dy, dx, dz, 
+				&m1, left, ld21, right, ld12, 
+				&p1, center, ld22);
+		if (status != CUBLAS_STATUS_SUCCESS)
+			STARPU_CUBLAS_REPORT_ERROR(status);
 #endif
 
 	}
@@ -122,6 +129,10 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, STARPU_A
 	unsigned nx21 = STARPU_MATRIX_GET_NY(descr[1]);
 	unsigned ny21 = STARPU_MATRIX_GET_NX(descr[1]);
 
+#ifdef STARPU_USE_CUDA
+	cublasStatus_t status;
+#endif
+
 	switch (s)
 	{
 		case 0:
@@ -129,12 +140,11 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, STARPU_A
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-#ifdef STARPU_HAVE_MAGMA
-			cublasSetKernelStream(starpu_cuda_get_local_stream());
-#else
-			starpu_cublas_set_stream();
-#endif
-			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			status = cublasStrsm(starpu_cublas_get_local_handle(),
+					CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT,
+					nx21, ny21, &p1, sub11, ld11, sub21, ld21);
+			if (status != CUBLAS_STATUS_SUCCESS)
+				STARPU_CUBLAS_REPORT_ERROR(status);
 			break;
 #endif
 		default:
@@ -206,9 +216,10 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 			{
 			int ret;
 			int info;
+			cudaStream_t stream = starpu_cuda_get_local_stream();
 #if (MAGMA_VERSION_MAJOR > 1) || (MAGMA_VERSION_MAJOR == 1 && MAGMA_VERSION_MINOR >= 4)
-			cublasSetKernelStream(starpu_cuda_get_local_stream());
-			magmablasSetKernelStream(starpu_cuda_get_local_stream());
+			cublasSetKernelStream(stream);
+			magmablasSetKernelStream(stream);
 #else
 			starpu_cublas_set_stream();
 #endif
@@ -219,7 +230,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 				STARPU_ABORT();
 			}
 #if (MAGMA_VERSION_MAJOR > 1) || (MAGMA_VERSION_MAJOR == 1 && MAGMA_VERSION_MINOR >= 4)
-			cudaError_t cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaError_t cures = cudaStreamSynchronize(stream);
 #else
 			cudaError_t cures = cudaThreadSynchronize();
 #endif
@@ -229,29 +240,36 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 			{
 
 			float *lambda11;
+			cublasStatus_t status;
+			cudaStream_t stream = starpu_cuda_get_local_stream();
+			cublasHandle_t handle = starpu_cublas_get_local_handle();
 			cudaHostAlloc((void **)&lambda11, sizeof(float), 0);
 
 			for (z = 0; z < nx; z++)
 			{
 				
-				cudaMemcpyAsync(lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
-				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+				cudaMemcpyAsync(lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, stream);
+				cudaStreamSynchronize(stream);
 
 				STARPU_ASSERT(*lambda11 != 0.0f);
 				
 				*lambda11 = sqrt(*lambda11);
 
 /*				cublasSetVector(1, sizeof(float), lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float)); */
-				cudaMemcpyAsync(&sub11[z+z*ld], lambda11, sizeof(float), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+				cudaMemcpyAsync(&sub11[z+z*ld], lambda11, sizeof(float), cudaMemcpyHostToDevice, stream);
+				float scal = 1.0f/(*lambda11);
 
-				cublasSscal(nx - z - 1, 1.0f/(*lambda11), &sub11[(z+1)+z*ld], 1);
+				status = cublasSscal(handle,
+						nx - z - 1, &scal, &sub11[(z+1)+z*ld], 1);
+				if (status != CUBLAS_STATUS_SUCCESS)
+					STARPU_CUBLAS_REPORT_ERROR(status);
 
-				cublasSsyr('U', nx - z - 1, -1.0f,
+				status = cublasSsyr(handle,
+							CUBLAS_FILL_MODE_UPPER,
+							nx - z - 1, &m1,
 							&sub11[(z+1)+z*ld], 1,
 							&sub11[(z+1)+(z+1)*ld], ld);
+				if (status != CUBLAS_STATUS_SUCCESS)
+					STARPU_CUBLAS_REPORT_ERROR(status);
 			}
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaStreamSynchronize(stream);
 			cudaFreeHost(lambda11);
 			}
 #endif

+ 3 - 3
examples/heat/dw_factolu.c

@@ -72,8 +72,8 @@ static struct starpu_codelet cl12 =
 	.cpu_funcs_name = {"dw_cpu_codelet_update_u12"},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u12},
-	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_RW},
 	.model = &model_12
@@ -85,8 +85,8 @@ static struct starpu_codelet cl21 =
 	.cpu_funcs_name = {"dw_cpu_codelet_update_u21"},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u21},
-	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_RW},
 	.model = &model_21
@@ -98,8 +98,8 @@ static struct starpu_codelet cl22 =
 	.cpu_funcs_name = {"dw_cpu_codelet_update_u22"},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u22},
-	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 3,
 	.modes = {STARPU_R, STARPU_R, STARPU_RW},
 	.model = &model_22

+ 0 - 1
examples/heat/dw_factolu.h

@@ -25,7 +25,6 @@
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
-#include <cublas.h>
 #endif
 
 #include "../common/blas.h"

+ 3 - 0
examples/heat/dw_factolu_grain.c

@@ -99,6 +99,7 @@ static struct starpu_codelet cl12 =
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u12},
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 2,
 	.model = &model_12
 };
@@ -144,6 +145,7 @@ static struct starpu_codelet cl21 =
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u21},
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 2,
 	.model = &model_21
 };
@@ -186,6 +188,7 @@ static struct starpu_codelet cl22 =
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u22},
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 3,
 	.model = &model_22
 };

+ 34 - 17
examples/heat/dw_factolu_kernels.c

@@ -20,6 +20,13 @@
  */
 #include "dw_factolu.h"
 
+#ifdef STARPU_USE_CUDA
+#include <cublas.h>
+#include <starpu_cublas_v2.h>
+static const float p1 =  1.0;
+static const float m1 = -1.0;
+#endif
+
 unsigned count_11_per_worker[STARPU_NMAXWORKERS] = {0};
 unsigned count_12_per_worker[STARPU_NMAXWORKERS] = {0};
 unsigned count_21_per_worker[STARPU_NMAXWORKERS] = {0};
@@ -134,10 +141,10 @@ static inline void dw_common_cpu_codelet_update_u22(void *descr[], int s, STARPU
 
 #ifdef STARPU_USE_CUDA
 		case 1:
-			starpu_cublas_set_stream();
-			cublasSgemm('n', 'n', dx, dy, dz, -1.0f, left, ld21,
-					right, ld12, 1.0f, center, ld22);
-			status = cublasGetError();
+			status = cublasSgemm(starpu_cublas_get_local_handle(),
+					CUBLAS_OP_N, CUBLAS_OP_N,
+					dx, dy, dz, &m1, left, ld21,
+					right, ld12, &p1, center, ld22);
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
@@ -198,10 +205,10 @@ static inline void dw_common_codelet_update_u12(void *descr[], int s, STARPU_ATT
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-			starpu_cublas_set_stream();
-			cublasStrsm('L', 'L', 'N', 'N', ny12, nx12,
-					1.0f, sub11, ld11, sub12, ld12);
-			status = cublasGetError();
+			status = cublasStrsm(starpu_cublas_get_local_handle(),
+					CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
+					ny12, nx12,
+					&p1, sub11, ld11, sub12, ld12);
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
@@ -260,9 +267,9 @@ static inline void dw_common_codelet_update_u21(void *descr[], int s, STARPU_ATT
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-			starpu_cublas_set_stream();
-			cublasStrsm('R', 'U', 'N', 'U', ny21, nx21, 1.0f, sub11, ld11, sub21, ld21);
-			status = cublasGetError();
+			status = cublasStrsm(starpu_cublas_get_local_handle(),
+					CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT,
+					ny21, nx21, &p1, sub11, ld11, sub21, ld21);
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
@@ -322,6 +329,12 @@ static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATT
 
 	unsigned long z;
 
+#ifdef STARPU_USE_CUDA
+	cudaStream_t stream;
+	cublasHandle_t handle;
+	cublasStatus_t status;
+#endif
+
 	switch (s)
 	{
 		case 0:
@@ -341,24 +354,28 @@ static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATT
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-			starpu_cublas_set_stream();
+			stream = starpu_cuda_get_local_stream();
+			handle = starpu_cublas_get_local_handle();
 			for (z = 0; z < nx; z++)
 			{
 				float pivot;
-				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
-				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, stream);
+				cudaStreamSynchronize(stream);
 
 				STARPU_ASSERT(pivot != 0.0f);
+				float scal = 1.0f/pivot;
 
-				cublasSscal(nx - z - 1, 1.0f/pivot, &sub11[z+(z+1)*ld], ld);
+				status = cublasSscal(handle,
+						nx - z - 1, &scal, &sub11[z+(z+1)*ld], ld);
+				if (status != CUBLAS_STATUS_SUCCESS)
+					STARPU_CUBLAS_REPORT_ERROR(status);
 
-				cublasSger(nx - z - 1, nx - z - 1, -1.0f,
+				status = cublasSger(handle,
+						nx - z - 1, nx - z - 1, &m1,
 								&sub11[z+(z+1)*ld], ld,
 								&sub11[(z+1)+z*ld], 1,
 								&sub11[(z+1) + (z+1)*ld],ld);
+				if (status != CUBLAS_STATUS_SUCCESS)
+					STARPU_CUBLAS_REPORT_ERROR(status);
 			}
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaStreamSynchronize(stream);
 
 			break;
 #endif

+ 3 - 0
examples/heat/dw_sparse_cg.c

@@ -241,6 +241,7 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 	struct starpu_task *task6 = create_task(maskiter | 6UL);
 #ifdef STARPU_USE_CUDA
 	task6->cl->cuda_funcs[0] = cublas_codelet_func_6;
+	task6->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
 #endif
 	task6->cl->cpu_funcs[0] = cpu_codelet_func_6;
 	task6->cl->cpu_funcs_name[0] = "cpu_codelet_func_6";
@@ -259,6 +260,7 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 	struct starpu_task *task7 = create_task(maskiter | 7UL);
 #ifdef STARPU_USE_CUDA
 	task7->cl->cuda_funcs[0] = cublas_codelet_func_7;
+	task7->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
 #endif
 	task7->cl->cpu_funcs[0] = cpu_codelet_func_7;
 	task7->cl->cpu_funcs_name[0] = "cpu_codelet_func_7";
@@ -292,6 +294,7 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 	struct starpu_task *task9 = create_task(maskiter | 9UL);
 #ifdef STARPU_USE_CUDA
 	task9->cl->cuda_funcs[0] = cublas_codelet_func_9;
+	task9->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
 #endif
 	task9->cl->cpu_funcs[0] = cpu_codelet_func_9;
 	task9->cl->cpu_funcs_name[0] = "cpu_codelet_func_9";

+ 0 - 4
examples/heat/dw_sparse_cg.h

@@ -29,10 +29,6 @@
 
 #include <starpu.h>
 
-#ifdef STARPU_USE_CUDA
-#include <cublas.h>
-#endif
-
 #include "../common/blas.h"
 
 #define MAXITER	100000

+ 30 - 13
examples/heat/dw_sparse_cg_kernels.c

@@ -17,6 +17,10 @@
 
 #include "dw_sparse_cg.h"
 
+#ifdef STARPU_USE_CUDA
+#include <starpu_cublas_v2.h>
+#endif
+
 /*
  *	Algorithm :
  *		
@@ -146,8 +150,10 @@ void cublas_codelet_func_3(void *descr[], void *arg)
 	vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
-	starpu_cublas_set_stream();
-	dot = cublasSdot (size, vec, 1, vec, 1);
+	cublasStatus_t status = cublasSdot (starpu_cublas_get_local_handle(), size, vec, 1, vec, 1, &dot);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 	pb->delta_new = dot;
 	pb->delta_0 = dot;
@@ -239,8 +245,10 @@ void cublas_codelet_func_5(void *descr[], void *arg)
 	STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
-	starpu_cublas_set_stream();
-	dot = cublasSdot (size, vecd, 1, vecq, 1);
+	cublasStatus_t status = cublasSdot (starpu_cublas_get_local_handle(), size, vecd, 1, vecq, 1, &dot);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 	pb->alpha = pb->delta_new / dot;
 }
@@ -283,8 +291,9 @@ void cublas_codelet_func_6(void *descr[], void *arg)
 
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
-	starpu_cublas_set_stream();
-	cublasSaxpy (size, pb->alpha, vecd, 1, vecx, 1);
+	cublasStatus_t status = cublasSaxpy (starpu_cublas_get_local_handle(), size, &pb->alpha, vecd, 1, vecx, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 
@@ -323,8 +332,11 @@ void cublas_codelet_func_7(void *descr[], void *arg)
 
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
-	starpu_cublas_set_stream();
-	cublasSaxpy (size, -pb->alpha, vecq, 1, vecr, 1);
+	float scal = -pb->alpha;
+
+	cublasStatus_t status = cublasSaxpy (starpu_cublas_get_local_handle(), size, &scal, vecq, 1, vecr, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 
@@ -367,8 +379,8 @@ void cublas_codelet_func_8(void *descr[], void *arg)
 	vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
-	starpu_cublas_set_stream();
-	dot = cublasSdot (size, vecr, 1, vecr, 1);
+	cublasStatus_t status = cublasSdot (starpu_cublas_get_local_handle(), size, vecr, 1, vecr, 1, &dot);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 	pb->delta_old = pb->delta_new;
 	pb->delta_new = dot;
@@ -416,11 +428,16 @@ void cublas_codelet_func_9(void *descr[], void *arg)
 
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
-	starpu_cublas_set_stream();
 	/* d = beta d */
-	cublasSscal(size, pb->beta, vecd, 1);
+	cublasStatus_t status;
+	status = cublasSscal(starpu_cublas_get_local_handle(), size, &pb->beta, vecd, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 
 	/* d = r + d */
-	cublasSaxpy (size, 1.0f, vecr, 1, vecd, 1);
+	float scal = 1.0f;
+	status = cublasSaxpy (starpu_cublas_get_local_handle(), size, &scal, vecr, 1, vecd, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif

+ 0 - 3
examples/lu/xlu.h

@@ -20,9 +20,6 @@
 
 #include <starpu.h>
 #include <common/blas.h>
-#ifdef STARPU_USE_CUDA
-#include <cublas.h>
-#endif
 
 #define TAG11(k)	((starpu_tag_t)( (1ULL<<60) | (unsigned long long)(k)))
 #define TAG12(k,i)	((starpu_tag_t)(((2ULL<<60) | (((unsigned long long)(k))<<32)	\

+ 86 - 32
examples/lu/xlu_kernels.c

@@ -21,6 +21,11 @@
 #include <math.h>
 #include <complex.h>
 
+#ifdef STARPU_USE_CUDA
+#include <cublas.h>
+#include <starpu_cublas_v2.h>
+#endif
+
 #define str(s) #s
 #define xstr(s)        str(s)
 #define STARPU_LU_STR(name)  xstr(STARPU_LU(name))
@@ -65,12 +70,11 @@ static inline void STARPU_LU(common_u22)(void *descr[],
 #ifdef STARPU_USE_CUDA
 		case 1:
 		{
-			starpu_cublas_set_stream();
-			CUBLAS_GEMM('n', 'n', dx, dy, dz,
-				*(CUBLAS_TYPE*)&m1, (CUBLAS_TYPE *)right, ld21, (CUBLAS_TYPE *)left, ld12,
-				*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE *)center, ld22);
+			status = CUBLAS_GEMM(starpu_cublas_get_local_handle(),
+				CUBLAS_OP_N, CUBLAS_OP_N, dx, dy, dz,
+				(CUBLAS_TYPE *)&m1, (CUBLAS_TYPE *)right, ld21, (CUBLAS_TYPE *)left, ld12,
+				(CUBLAS_TYPE *)&p1, (CUBLAS_TYPE *)center, ld22);
 
-			status = cublasGetError();
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
@@ -186,11 +190,11 @@ static inline void STARPU_LU(common_u12)(void *descr[],
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-			starpu_cublas_set_stream();
-			CUBLAS_TRSM('L', 'L', 'N', 'N', ny12, nx12,
-					*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub12, ld12);
+			status = CUBLAS_TRSM(starpu_cublas_get_local_handle(),
+					CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
+					ny12, nx12,
+					(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub12, ld12);
 
-			status = cublasGetError();
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
@@ -273,11 +277,11 @@ static inline void STARPU_LU(common_u21)(void *descr[],
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-			starpu_cublas_set_stream();
-			CUBLAS_TRSM('R', 'U', 'N', 'U', ny21, nx21,
-					*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub21, ld21);
+			status = CUBLAS_TRSM(starpu_cublas_get_local_handle(),
+					CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT,
+					ny21, nx21,
+					(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub21, ld21);
 
-			status = cublasGetError();
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
@@ -345,6 +349,12 @@ static inline void STARPU_LU(common_u11)(void *descr[],
 
 	unsigned long z;
 
+#ifdef STARPU_USE_CUDA
+	cublasStatus_t status;
+	cublasHandle_t handle;
+	cudaStream_t stream;
+#endif
+
 	switch (s)
 	{
 		case 0:
@@ -369,13 +379,14 @@ static inline void STARPU_LU(common_u11)(void *descr[],
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-			starpu_cublas_set_stream();
+			handle = starpu_cublas_get_local_handle();
+			stream = starpu_cuda_get_local_stream();
 			for (z = 0; z < nx; z++)
 			{
 				TYPE pivot;
 				TYPE inv_pivot;
-				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
-				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, stream);
+				cudaStreamSynchronize(stream);
 
 #ifdef COMPLEX_LU
 				STARPU_ASSERT(fpclassify(creal(pivot)) != FP_ZERO);
@@ -385,15 +396,23 @@ static inline void STARPU_LU(common_u11)(void *descr[],
 #endif
 				
 				inv_pivot = 1.0/pivot;
-				CUBLAS_SCAL(nx - z - 1, *(CUBLAS_TYPE*)&inv_pivot, (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
+				status = CUBLAS_SCAL(handle,
+						nx - z - 1,
+						(CUBLAS_TYPE*)&inv_pivot, (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
+				if (status != CUBLAS_STATUS_SUCCESS)
+					STARPU_CUBLAS_REPORT_ERROR(status);
 				
-				CUBLAS_GER(nx - z - 1, nx - z - 1, *(CUBLAS_TYPE*)&m1,
+				status = CUBLAS_GER(handle,
+						nx - z - 1, nx - z - 1,
+						(CUBLAS_TYPE*)&m1,
 						(CUBLAS_TYPE*)&sub11[(z+1)+z*ld], 1,
 						(CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld,
 						(CUBLAS_TYPE*)&sub11[(z+1) + (z+1)*ld],ld);
+				if (status != CUBLAS_STATUS_SUCCESS)
+					STARPU_CUBLAS_REPORT_ERROR(status);
 			}
 			
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaStreamSynchronize(stream);
 
 			break;
 #endif
@@ -462,6 +481,12 @@ static inline void STARPU_LU(common_u11_pivot)(void *descr[],
 	unsigned *ipiv = piv->piv;
 	unsigned first = piv->first;
 
+#ifdef STARPU_USE_CUDA
+	cublasStatus_t status;
+	cublasHandle_t handle;
+	cudaStream_t stream;
+#endif
+
 	switch (s)
 	{
 		case 0:
@@ -500,44 +525,63 @@ static inline void STARPU_LU(common_u11_pivot)(void *descr[],
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-			starpu_cublas_set_stream();
+			handle = starpu_cublas_get_local_handle();
+			stream = starpu_cuda_get_local_stream();
 			for (z = 0; z < nx; z++)
 			{
 				TYPE pivot;
 				TYPE inv_pivot;
-				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
-				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, stream);
+				cudaStreamSynchronize(stream);
 
 				if (fabs((double)(pivot)) < PIVOT_THRESHHOLD)
 				{
 					/* find the pivot */
-					int piv_ind = CUBLAS_IAMAX(nx - z, (CUBLAS_TYPE*)&sub11[z*(ld+1)], ld) - 1;
+					int piv_ind;
+					status = CUBLAS_IAMAX(handle,
+						nx - z, (CUBLAS_TYPE*)&sub11[z*(ld+1)], ld, &piv_ind);
+					if (status != CUBLAS_STATUS_SUCCESS)
+						STARPU_CUBLAS_REPORT_ERROR(status);
+					piv_ind -= 1;
 	
 					ipiv[z + first] = piv_ind + z + first;
 
 					/* swap if needed */
 					if (piv_ind != 0)
 					{
-						CUBLAS_SWAP(nx, (CUBLAS_TYPE*)&sub11[z*ld], 1, (CUBLAS_TYPE*)&sub11[(z+piv_ind)*ld], 1);
+						status = CUBLAS_SWAP(handle,
+							nx,
+							(CUBLAS_TYPE*)&sub11[z*ld], 1,
+							(CUBLAS_TYPE*)&sub11[(z+piv_ind)*ld], 1);
+						if (status != CUBLAS_STATUS_SUCCESS)
+							STARPU_CUBLAS_REPORT_ERROR(status);
 					}
 
-					cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
-					cudaStreamSynchronize(starpu_cuda_get_local_stream());
+					cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, stream);
+					cudaStreamSynchronize(stream);
 				}
 
 				STARPU_ASSERT(pivot != 0.0);
 				
 				inv_pivot = 1.0/pivot;
-				CUBLAS_SCAL(nx - z - 1, *(CUBLAS_TYPE*)&inv_pivot, (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
+				status = CUBLAS_SCAL(handle,
+						nx - z - 1,
+						(CUBLAS_TYPE*)&inv_pivot,
+						(CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
+				if (status != CUBLAS_STATUS_SUCCESS)
+					STARPU_CUBLAS_REPORT_ERROR(status);
 				
-				CUBLAS_GER(nx - z - 1, nx - z - 1, *(CUBLAS_TYPE*)&m1,
+				status = CUBLAS_GER(handle,
+						nx - z - 1, nx - z - 1,
+						(CUBLAS_TYPE*)&m1,
 						(CUBLAS_TYPE*)&sub11[(z+1)+z*ld], 1,
 						(CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld,
 						(CUBLAS_TYPE*)&sub11[(z+1) + (z+1)*ld],ld);
-				
+				if (status != CUBLAS_STATUS_SUCCESS)
+					STARPU_CUBLAS_REPORT_ERROR(status);
 			}
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaStreamSynchronize(stream);
 
 			break;
 #endif
@@ -605,6 +649,11 @@ static inline void STARPU_LU(common_pivot)(void *descr[],
 	unsigned *ipiv = piv->piv;
 	unsigned first = piv->first;
 
+#ifdef STARPU_USE_CUDA
+	cublasStatus_t status;
+	cublasHandle_t handle;
+#endif
+
 	switch (s)
 	{
 		case 0:
@@ -619,13 +668,18 @@ static inline void STARPU_LU(common_pivot)(void *descr[],
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-			starpu_cublas_set_stream();
+			handle = starpu_cublas_get_local_handle();
 			for (row = 0; row < nx; row++)
 			{
 				unsigned rowpiv = ipiv[row+first] - first;
 				if (rowpiv != row)
 				{
-					CUBLAS_SWAP(nx, (CUBLAS_TYPE*)&matrix[row*ld], 1, (CUBLAS_TYPE*)&matrix[rowpiv*ld], 1);
+					status = CUBLAS_SWAP(handle,
+							nx,
+							(CUBLAS_TYPE*)&matrix[row*ld], 1,
+							(CUBLAS_TYPE*)&matrix[rowpiv*ld], 1);
+					if (status != CUBLAS_STATUS_SUCCESS)
+						STARPU_CUBLAS_REPORT_ERROR(status);
 				}
 			}
 

+ 11 - 4
examples/mult/xgemm.c

@@ -36,7 +36,10 @@
 
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
-#include <cublas.h>
+#include <starpu_cublas_v2.h>
+static const TYPE p1 = 1.0;
+static const TYPE m1 = -1.0;
+static const TYPE v0 = 0.0;
 #endif
 
 static unsigned niter = 10;
@@ -161,9 +164,13 @@ static void cublas_mult(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
 	unsigned ldB = STARPU_MATRIX_GET_LD(descr[1]);
 	unsigned ldC = STARPU_MATRIX_GET_LD(descr[2]);
 
-	starpu_cublas_set_stream();
-	CUBLAS_GEMM('n', 'n', nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
-				     (TYPE)0.0, subC, ldC);
+	cublasStatus_t status = CUBLAS_GEMM(starpu_cublas_get_local_handle(),
+			CUBLAS_OP_N, CUBLAS_OP_N,
+			nxC, nyC, nyA,
+			&p1, subA, ldA, subB, ldB,
+			&v0, subC, ldC);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 

+ 6 - 7
examples/pipeline/pipeline.c

@@ -35,7 +35,7 @@
 #include <common/blas.h>
 
 #ifdef STARPU_USE_CUDA
-#include <cublas.h>
+#include <starpu_cublas_v2.h>
 #endif
 
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
@@ -100,10 +100,9 @@ void pipeline_cublas_axpy(void *descr[], void *arg)
 	float *x = (float *) STARPU_VECTOR_GET_PTR(descr[0]);
 	float *y = (float *) STARPU_VECTOR_GET_PTR(descr[1]);
 	int n = STARPU_VECTOR_GET_NX(descr[0]);
+	float alpha = 1.;
 
-	starpu_cublas_set_stream();
-	cublasSaxpy(n, 1., x, 1, y, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+	cublasSaxpy(starpu_cublas_get_local_handle(), n, &alpha, x, 1, y, 1);
 }
 #endif
 
@@ -119,6 +118,7 @@ static struct starpu_codelet pipeline_codelet_axpy =
 	.cpu_funcs_name = {"pipeline_cpu_axpy"},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {pipeline_cublas_axpy},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_RW},
@@ -144,10 +144,8 @@ void pipeline_cublas_sum(void *descr[], void *arg)
 	int n = STARPU_VECTOR_GET_NX(descr[0]);
 	float y;
 
-	starpu_cublas_set_stream();
-	y = cublasSasum(n, x, 1);
+	cublasSasum(starpu_cublas_get_local_handle(), n, x, 1, &y);
 
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	FPRINTF(stderr,"CUBLAS finished with %f\n", y);
 }
 #endif
@@ -164,6 +162,7 @@ static struct starpu_codelet pipeline_codelet_sum =
 	.cpu_funcs_name = {"pipeline_cpu_sum"},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {pipeline_cublas_sum},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 1,
 	.modes = {STARPU_R},

+ 13 - 11
examples/reductions/dot_product.c

@@ -29,7 +29,7 @@
 
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
-#include <cublas.h>
+#include <starpu_cublas_v2.h>
 #endif
 
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
@@ -245,7 +245,7 @@ void dot_cpu_func(void *descr[], void *cl_arg)
 void dot_cuda_func(void *descr[], void *cl_arg)
 {
 	DOT_TYPE current_dot;
-	DOT_TYPE local_dot;
+	float local_dot;
 
 	float *local_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	float *local_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
@@ -256,9 +256,10 @@ void dot_cuda_func(void *descr[], void *cl_arg)
 	cudaMemcpyAsync(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
-	if (cublas_version >= 7050)
-		starpu_cublas_set_stream();
-	local_dot = (DOT_TYPE)cublasSdot(n, local_x, 1, local_y, 1);
+	cublasStatus_t status = cublasSdot(starpu_cublas_get_local_handle(), n, local_x, 1, local_y, 1, &local_dot);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 	/* FPRINTF(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot); */
 	current_dot += local_dot;
@@ -358,14 +359,15 @@ int main(int argc, char **argv)
 #endif
 
 #ifdef STARPU_USE_CUDA
-	/* cublasSdot has synchronization issues when using a non-blocking stream (Nvidia bugid 1669886) */
-	cublasGetVersion(&cublas_version);
+	cublasHandle_t handle;
+	cublasCreate(&handle);
+	cublasGetVersion(handle, &cublas_version);
+	cublasDestroy(handle);
 	if (cublas_version >= 7050)
 		starpu_cublas_init();
-	if (starpu_get_env_number_default("STARPU_NWORKER_PER_CUDA", 1) > 1
-	 && starpu_get_env_number_default("STARPU_CUDA_THREAD_PER_WORKER", 0) == 1)
-		/* Disable the sdot cublas kernel, it is bogus with concurrent
-		 * multistream execution (Nvidia bugid 1881192) */
+	else
+		/* Disable the sdot cublas kernel, it is bogus with a
+		 * non-blocking stream (Nvidia bugid 1669886) */
 		dot_codelet.cuda_funcs[0] = NULL;
 #endif
 

+ 0 - 4
examples/sched_ctx/gpu_partition.c

@@ -26,10 +26,6 @@
 
 #include <common/blas.h>
 
-#ifdef STARPU_USE_CUDA
-#include <cublas.h>
-#endif
-
 
 #define N	512*512
 #define NITER   100

+ 2 - 0
examples/spmv/dw_block_spmv.c

@@ -167,6 +167,7 @@ struct starpu_codelet cl =
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cublas_block_spmv},
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 3,
 	.modes = {STARPU_R, STARPU_R, STARPU_RW}
 };
@@ -320,6 +321,7 @@ int main(STARPU_ATTRIBUTE_UNUSED int argc,
 	if (ret == -ENODEV)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	starpu_cublas_init();
 
 	sem_init(&sem, 0, 0U);
 

+ 0 - 4
examples/spmv/dw_block_spmv.h

@@ -28,10 +28,6 @@
 
 #include <starpu.h>
 
-#ifdef STARPU_USE_CUDA
-#include <cublas.h>
-#endif
-
 void cpu_block_spmv(void *descr[], void *_args);
 
 #ifdef STARPU_USE_CUDA

+ 10 - 2
examples/spmv/dw_block_spmv_kernels.c

@@ -24,6 +24,12 @@
  *   U22 
  */
 
+#ifdef STARPU_USE_CUDA
+#include <starpu_cublas_v2.h>
+static const float p1 =  1.0;
+static const float m1 = -1.0;
+#endif
+
 static inline void common_block_spmv(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
 {
 	/* printf("22\n"); */
@@ -43,8 +49,10 @@ static inline void common_block_spmv(void *descr[], int s, STARPU_ATTRIBUTE_UNUS
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-			starpu_cublas_set_stream();
-			cublasSgemv ('t', dx, dy, 1.0f, block, ld, in, 1, 1.0f, out, 1);
+			cublasStatus_t status = cublasSgemv (starpu_cublas_get_local_handle(),
+					CUBLAS_OP_T, dx, dy, &p1, block, ld, in, 1, &p1, out, 1);
+			if (status != CUBLAS_STATUS_SUCCESS)
+				STARPU_CUBLAS_REPORT_ERROR(status);
 			break;
 #endif
 		default:

+ 34 - 0
include/starpu_cublas_v2.h

@@ -0,0 +1,34 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2012  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_CUBLAS_V2_H__
+#define __STARPU_CUBLAS_V2_H__
+
+#include <cublas_v2.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+cublasHandle_t starpu_cublas_get_local_handle(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __STARPU_CUBLAS_V2_H__ */

+ 0 - 1
src/datawizard/copy_driver.h

@@ -29,7 +29,6 @@
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
-#include <cublas.h>
 #endif
 
 #ifdef STARPU_USE_OPENCL

+ 21 - 0
src/drivers/cuda/starpu_cublas.c

@@ -22,8 +22,11 @@
 
 #ifdef STARPU_USE_CUDA
 #include <cublas.h>
+#include <starpu_cublas_v2.h>
 
 static int cublas_initialized[STARPU_NMAXWORKERS];
+static cublasHandle_t cublas_handles[STARPU_NMAXWORKERS];
+static cublasHandle_t main_handle;
 static starpu_pthread_mutex_t mutex;
 
 static unsigned get_idx(void) {
@@ -51,6 +54,9 @@ static void init_cublas_func(void *args STARPU_ATTRIBUTE_UNUSED)
 			STARPU_CUBLAS_REPORT_ERROR(cublasst);
 	}
 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+
+	cublasCreate(&cublas_handles[starpu_worker_get_id_check()]);
+	cublasSetStream(cublas_handles[starpu_worker_get_id_check()], starpu_cuda_get_local_stream());
 }
 
 static void set_cublas_stream_func(void *args STARPU_ATTRIBUTE_UNUSED)
@@ -65,6 +71,8 @@ static void shutdown_cublas_func(void *args STARPU_ATTRIBUTE_UNUSED)
 	if (!--cublas_initialized[idx])
 		cublasShutdown();
 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+
+	cublasDestroy(cublas_handles[starpu_worker_get_id_check()]);
 }
 #endif
 
@@ -73,6 +81,8 @@ void starpu_cublas_init(void)
 #ifdef STARPU_USE_CUDA
 	starpu_execute_on_each_worker(init_cublas_func, NULL, STARPU_CUDA);
 	starpu_execute_on_each_worker(set_cublas_stream_func, NULL, STARPU_CUDA);
+
+	cublasCreate(&main_handle);
 #endif
 }
 
@@ -80,6 +90,8 @@ void starpu_cublas_shutdown(void)
 {
 #ifdef STARPU_USE_CUDA
 	starpu_execute_on_each_worker(shutdown_cublas_func, NULL, STARPU_CUDA);
+
+	cublasDestroy(main_handle);
 #endif
 }
 
@@ -92,3 +104,12 @@ void starpu_cublas_set_stream(void)
 		cublasSetKernelStream(starpu_cuda_get_local_stream());
 #endif
 }
+
+#ifdef STARPU_USE_CUDA
+cublasHandle_t starpu_cublas_get_local_handle(void)
+{
+	int workerid = starpu_worker_get_id();
+	if (workerid >= 0)
+		return cublas_handles[workerid];
+	else
+		return main_handle;
+}
+#endif
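
Putting the pieces together, the application-side lifecycle expected by this change looks roughly as follows; cl stands for any hypothetical codelet whose .cuda_flags contains STARPU_CUDA_ASYNC:

#include <starpu.h>

extern struct starpu_codelet cl; /* hypothetical codelet, .cuda_flags = {STARPU_CUDA_ASYNC} */

int main(void)
{
	if (starpu_init(NULL) != 0)
		return 77;
	starpu_cublas_init();	/* one CUBLAS v2 handle per CUDA worker, bound to its stream */

	/* ... register data handles and submit tasks using cl ... */
	starpu_task_wait_for_all();

	starpu_cublas_shutdown();	/* destroys the per-worker handles and the main handle */
	starpu_shutdown();
	return 0;
}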

+ 9 - 14
tests/microbenchs/matrix_as_vector.c

@@ -18,7 +18,7 @@
 #include "../helper.h"
 
 #ifdef STARPU_USE_CUDA
-#  include <cublas.h>
+#  include <starpu_cublas_v2.h>
 #endif
 
 /*
@@ -55,8 +55,9 @@ void vector_cuda_func(void *descr[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
 	float *matrix = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	int nx = STARPU_VECTOR_GET_NX(descr[0]);
 
-	starpu_cublas_set_stream();
-	float sum = cublasSasum(nx, matrix, 1);
+	float sum;
+	cublasSasum(starpu_cublas_get_local_handle(), nx, matrix, 1, &sum);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	sum /= nx;
 
 	cudaMemcpyAsync(matrix, &sum, sizeof(matrix[0]), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
@@ -87,7 +88,9 @@ void matrix_cuda_func(void *descr[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
 	int nx = STARPU_MATRIX_GET_NX(descr[0]);
 	int ny = STARPU_MATRIX_GET_NY(descr[0]);
 
-	float sum = cublasSasum(nx*ny, matrix, 1);
+	float sum;
+	cublasSasum(starpu_cublas_get_local_handle(), nx*ny, matrix, 1, &sum);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	sum /= nx*ny;
 
 	cudaMemcpyAsync(matrix, &sum, sizeof(matrix[0]), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
@@ -218,12 +221,7 @@ int main(int argc, char **argv)
 	ret = starpu_init(NULL);
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-#ifdef STARPU_USE_CUDA
-	/* cublasSasum has synchronization issues when using a non-blocking stream */
-	cublasGetVersion(&cublas_version);
-	if (cublas_version >= 7050)
-		starpu_cublas_init();
-#endif
+	starpu_cublas_init();
 
 	devices = starpu_cpu_worker_get_count();
 	if (devices)
@@ -246,11 +244,8 @@ int main(int argc, char **argv)
 
 error:
 	if (ret == -ENODEV) ret=STARPU_TEST_SKIPPED;
-#ifdef STARPU_USE_CUDA
-	if (cublas_version >= 7050)
-		starpu_cublas_shutdown();
-#endif
 
+	starpu_cublas_shutdown();
 	starpu_shutdown();
 	STARPU_RETURN(ret);
 }