8 年之前 · 441644b632
--- a/ChangeLog
+++ b/ChangeLog
@@ -60,6 +60,7 @@ New features:
 
																   * Add STARPU_PERF_MODEL_HOMOGENEOUS_CUDA/OPENCL/MIC/SCC to share performance
															
 
																     models between devices, making calibration much faster.
															
 
																   * Add modular-heft-prio scheduler.
															
 
																+  * Add starpu_cublas_get_local_handle helper.
															
 
																 Changes:
															
 
																   * Fix performance regression of lws for small tasks.
															
--- a/Makefile.am
+++ b/Makefile.am
@@ -98,6 +98,7 @@ versinclude_HEADERS = 				\
 
																 	include/starpu_rand.h			\
															
 
																 	include/starpu_disk.h			\
															
 
																 	include/starpu_cublas.h			\
															
 
																+	include/starpu_cublas_v2.h		\
															
 
																 	include/starpu_driver.h			\
															
 
																 	include/starpu_stdlib.h			\
															
 
																 	include/starpu_thread.h			\
															
--- a/doc/doxygen/chapters/210_check_list_performance.doxy
+++ b/doc/doxygen/chapters/210_check_list_performance.doxy
@@ -62,10 +62,11 @@ Unfortunately, some CUDA libraries do not have stream variants of
 
																 kernels. That will lower the potential for overlapping.
															
 
																 Calling starpu_cublas_init() makes StarPU already do appropriate calls for the
															
 
																-CUBLAS library. Some libraries like Magma may however change the current stream,
															
 
																+CUBLAS library. Some libraries like Magma may however change the current stream of CUBLAS v1;
															
 
																 one then has to call <c>cublasSetKernelStream(starpu_cuda_get_local_stream())</c> at
															
 
																 the beginning of the codelet to make sure that CUBLAS is really using the proper
															
 
																-stream.
															
 
																+stream. When using CUBLAS v2, starpu_cublas_get_local_handle() can be called to queue CUBLAS
															
 
																+kernels with the proper configuration.
															
 
																 If the kernel can be made to only use this local stream or other self-allocated
															
 
																 streams, i.e. the whole kernel submission can be made asynchronous, then
															
--- a/doc/doxygen/chapters/api/cuda_extensions.doxy
+++ b/doc/doxygen/chapters/api/cuda_extensions.doxy
@@ -69,10 +69,16 @@ initialized on every device.
 
																 \fn void starpu_cublas_set_stream(void)
															
 
																 \ingroup API_CUDA_Extensions
															
 
																-This function sets the proper cublas stream. This must be called from the CUDA
															
 
																-codelet before calling cublas kernels, so that they are queued on the proper
															
 
																+This function sets the proper CUBLAS stream for CUBLAS v1. This must be called from the CUDA
															
 
																+codelet before calling CUBLAS v1 kernels, so that they are queued on the proper
															
 
																 CUDA stream. When using one thread per CUDA worker, this function does not
															
 
																-do anything since the cublas stream does not change, and is set once by
															
 
																+do anything since the CUBLAS stream does not change, and is set once by
															
 
																+starpu_cublas_init().
															
 
																+
															
 
																+\fn cublasHandle_t starpu_cublas_get_local_handle(void)
															
 
																+\ingroup API_CUDA_Extensions
															
 
																+This function returns the CUBLAS v2 handle to be used to queue CUBLAS v2
															
 
																+kernels. It is properly initialized and configured for multistream by
															
 
																 starpu_cublas_init().
															
 
																 \fn void starpu_cublas_shutdown(void)
															
--- a/examples/audio/starpu_audio_processing.c
+++ b/examples/audio/starpu_audio_processing.c
@@ -32,6 +32,7 @@
 
																 #include <fftw3.h>
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 #include <cufft.h>
															
 
																+#include <starpu_cublas_v2.h>
															
 
																 #endif
															
 
																 /* #define SAVE_RAW	1 */
															
@@ -198,7 +199,6 @@ static void band_filter_kernel_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *
 
																 	localout = plans[workerid].localout;
															
 
																-	starpu_cublas_set_stream();
															
 
																 	/* FFT */
															
 
																 	cures = cufftExecR2C(plans[workerid].plan, localA, localout);
															
 
																 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
@@ -216,7 +216,10 @@ static void band_filter_kernel_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *
 
																 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																 	/* FFTW does not normalize its output ! */
															
 
																-	cublasSscal (nsamples, 1.0f/nsamples, localA, 1);
															
 
																+	float scal = 1.0f/nsamples;
															
 
																+	cublasStatus_t status = cublasSscal (starpu_cublas_local_handle(), nsamples, &scal, localA, 1);
															
 
																+	if (status != CUBLAS_STATUS_SUCCESS)
															
 
																+		STARPU_CUBLAS_REPORT_ERROR(status);
															
 
																 }
															
 
																 #endif
															
--- a/examples/axpy/axpy.c
+++ b/examples/axpy/axpy.c
@@ -30,7 +30,7 @@
 
																 #include <common/blas.h>
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-#include <cublas.h>
															
 
																+#include <starpu_cublas_v2.h>
															
 
																 #endif
															
 
																 #include "axpy.h"
															
@@ -74,8 +74,9 @@ void axpy_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
 
																 	TYPE *block_x = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																 	TYPE *block_y = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
															
 
																-	starpu_cublas_set_stream();
															
 
																-	CUBLASAXPY((int)n, alpha, block_x, 1, block_y, 1);
															
 
																+	cublasStatus_t status = CUBLASAXPY(starpu_cublas_get_local_handle(), (int)n, &alpha, block_x, 1, block_y, 1);
															
 
																+	if (status != CUBLAS_STATUS_SUCCESS)
															
 
																+		STARPU_CUBLAS_REPORT_ERROR(status);
															
 
																 }
															
 
																 #endif
															
--- a/examples/cg/cg.c
+++ b/examples/cg/cg.c
@@ -21,7 +21,6 @@
 
																 #ifdef STARPU_USE_CUDA
															
 
																 #include <cuda.h>
															
 
																-#include <cublas.h>
															
 
																 #endif
															
 
																 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
															
--- a/examples/cg/cg_kernels.c
+++ b/examples/cg/cg_kernels.c
@@ -22,6 +22,12 @@
 
																 #include <math.h>
															
 
																 #include <limits.h>
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+#include <starpu_cublas_v2.h>
															
 
																+static const TYPE p1 = 1.0;
															
 
																+static const TYPE m1 = -1.0;
															
 
																+#endif
															
 
																+
															
 
																 #if 0
															
 
																 static void print_vector_from_descr(unsigned nx, TYPE *v)
															
 
																 {
															
@@ -81,8 +87,9 @@ static void accumulate_variable_cuda(void *descr[], void *cl_arg)
 
																 	TYPE *v_dst = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
															
 
																 	TYPE *v_src = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
															
 
																-	starpu_cublas_set_stream();
															
 
																-	cublasaxpy(1, (TYPE)1.0, v_src, 1, v_dst, 1);
															
 
																+	cublasStatus_t status = cublasaxpy(starpu_cublas_get_local_handle(), 1, &p1, v_src, 1, v_dst, 1);
															
 
																+	if (status != CUBLAS_STATUS_SUCCESS)
															
 
																+		STARPU_CUBLAS_REPORT_ERROR(status);
															
 
																 }
															
 
																 #endif
															
@@ -120,9 +127,10 @@ static void accumulate_vector_cuda(void *descr[], void *cl_arg)
 
																 	TYPE *v_dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																 	TYPE *v_src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
															
 
																 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
															
 
																- 
															
 
																-	starpu_cublas_set_stream();
															
 
																-	cublasaxpy(n, (TYPE)1.0, v_src, 1, v_dst, 1);
															
 
																+
															
 
																+	cublasStatus_t status = cublasaxpy(starpu_cublas_get_local_handle(), n, &p1, v_src, 1, v_dst, 1);
															
 
																+	if (status != CUBLAS_STATUS_SUCCESS)
															
 
																+		STARPU_CUBLAS_REPORT_ERROR(status);
															
 
																 }
															
 
																 #endif
															
@@ -249,10 +257,26 @@ static void dot_kernel_cuda(void *descr[], void *cl_arg)
 
																 	unsigned n = STARPU_VECTOR_GET_NX(descr[1]);
															
 
																-	/* Contrary to cublasSdot, this function puts its result directly in
															
 
																-	 * device memory, so that we don't have to transfer that value back and
															
 
																-	 * forth. */
															
 
																-	dot_host(v1, v2, n, dot);
															
 
																+	int version;
															
 
																+	cublasGetVersion(starpu_cublas_get_local_handle(), &version);
															
 
																+
															
 
																+	/* FIXME: check in Nvidia bug #1882017 when this gets fixed */
															
 
																+	if (version < 99999)
															
 
																+	{
															
 
																+		/* This function puts its result directly in device memory, so
															
 
																+		 * that we don't have to transfer that value back and forth. */
															
 
																+		dot_host(v1, v2, n, dot);
															
 
																+	}
															
 
																+	else
															
 
																+	{
															
 
																+		/* Should be able to put result in GPU, but does not yet, see
															
 
																+		 * Nvidia bug #1882017 */
															
 
																+		cublasStatus_t status = cublasdot(starpu_cublas_get_local_handle(),
															
 
																+			n, v1, 1, v2, 1, dot);
															
 
																+		if (status != CUBLAS_STATUS_SUCCESS)
															
 
																+			STARPU_CUBLAS_REPORT_ERROR(status);
															
 
																+		cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																+	}
															
 
																 }
															
 
																 #endif
															
@@ -337,8 +361,9 @@ static void scal_kernel_cuda(void *descr[], void *cl_arg)
 
																 	/* v1 = p1 v1 */
															
 
																 	TYPE alpha = p1;
															
 
																-	starpu_cublas_set_stream();
															
 
																-	cublasscal(n, alpha, v1, 1);
															
 
																+	cublasStatus_t status = cublasscal(starpu_cublas_get_local_handle(), n, &alpha, v1, 1);
															
 
																+	if (status != CUBLAS_STATUS_SUCCESS)
															
 
																+		STARPU_CUBLAS_REPORT_ERROR(status);
															
 
																 }
															
 
																 #endif
															
@@ -392,8 +417,10 @@ static void gemv_kernel_cuda(void *descr[], void *cl_arg)
 
																 	starpu_codelet_unpack_args(cl_arg, &beta, &alpha);
															
 
																 	/* Compute v1 = alpha M v2 + beta v1 */
															
 
																-	starpu_cublas_set_stream();
															
 
																-	cublasgemv('N', nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
															
 
																+	cublasStatus_t status = cublasgemv(starpu_cublas_get_local_handle(),
															
 
																+			CUBLAS_OP_N, nx, ny, &alpha, M, ld, v2, 1, &beta, v1, 1);
															
 
																+	if (status != CUBLAS_STATUS_SUCCESS)
															
 
																+		STARPU_CUBLAS_REPORT_ERROR(status);
															
 
																 }
															
 
																 #endif
															
@@ -508,9 +535,13 @@ static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
 
																 	 *	v1 = p1 v1
															
 
																 	 *	v1 = v1 + p2 v2
															
 
																 	 */
															
 
																-	starpu_cublas_set_stream();
															
 
																-	cublasscal(n, p1, v1, 1);
															
 
																-	cublasaxpy(n, p2, v2, 1, v1, 1);
															
 
																+	cublasStatus_t status;
															
 
																+	status = cublasscal(starpu_cublas_get_local_handle(), n, &p1, v1, 1);
															
 
																+	if (status != CUBLAS_STATUS_SUCCESS)
															
 
																+		STARPU_CUBLAS_REPORT_ERROR(status);
															
 
																+	status = cublasaxpy(starpu_cublas_get_local_handle(), n, &p2, v2, 1, v1, 1);
															
 
																+	if (status != CUBLAS_STATUS_SUCCESS)
															
 
																+		STARPU_CUBLAS_REPORT_ERROR(status);
															
 
																 }
															
 
																 #endif
															
@@ -589,8 +620,10 @@ static void axpy_kernel_cuda(void *descr[], void *cl_arg)
 
																 	/* Compute v1 = v1 + p1 * v2.
															
 
																 	 */
															
 
																-	starpu_cublas_set_stream();
															
 
																-	cublasaxpy(n, p1, v2, 1, v1, 1);
															
 
																+	cublasStatus_t status = cublasaxpy(starpu_cublas_get_local_handle(),
															
 
																+			n, &p1, v2, 1, v1, 1);
															
 
																+	if (status != CUBLAS_STATUS_SUCCESS)
															
 
																+		STARPU_CUBLAS_REPORT_ERROR(status);
															
 
																 }
															
 
																 #endif
															
--- a/examples/cholesky/cholesky.h
+++ b/examples/cholesky/cholesky.h
@@ -24,7 +24,6 @@
 
																 #ifdef STARPU_USE_CUDA
															
 
																 #include <cuda.h>
															
 
																 #include <cuda_runtime.h>
															
 
																-#include <cublas.h>
															
 
																 #endif
															
 
																 #include <common/blas.h>
															
--- a/examples/cholesky/cholesky_kernels.c
+++ b/examples/cholesky/cholesky_kernels.c
@@ -25,15 +25,24 @@
 
																 #include <starpu.h>
															
 
																 #include "cholesky.h"
															
 
																 #include "../common/blas.h"
															
 
																-#if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_MAGMA)
															
 
																+#if defined(STARPU_USE_CUDA)
															
 
																+#include <cublas.h>
															
 
																+#include <starpu_cublas_v2.h>
															
 
																+#if defined(STARPU_HAVE_MAGMA)
															
 
																 #include "magma.h"
															
 
																 #include "magma_lapack.h"
															
 
																 #endif
															
 
																+#endif
															
 
																 /*
															
 
																  *   U22 
															
 
																  */
															
 
																+#if defined(STARPU_USE_CUDA)
															
 
																+static const float p1 =  1.0;
															
 
																+static const float m1 = -1.0;
															
 
																+#endif
															
 
																+
															
 
																 static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
															
 
																 {
															
 
																 	/* printf("22\n"); */
															
@@ -78,14 +87,12 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STAR
 
																 	{
															
 
																 		/* CUDA kernel */
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-#ifdef STARPU_HAVE_MAGMA
															
 
																-		cublasSetKernelStream(starpu_cuda_get_local_stream());
															
 
																-#else
															
 
																-		starpu_cublas_set_stream();
															
 
																-#endif
															
 
																-		cublasSgemm('n', 't', dy, dx, dz, 
															
 
																-				-1.0f, left, ld21, right, ld12, 
															
 
																-				 1.0f, center, ld22);
															
 
																+		cublasStatus_t status = cublasSgemm(starpu_cublas_get_local_handle(),
															
 
																+				CUBLAS_OP_N, CUBLAS_OP_T, dy, dx, dz, 
															
 
																+				&m1, left, ld21, right, ld12, 
															
 
																+				&p1, center, ld22);
															
 
																+		if (status != CUBLAS_STATUS_SUCCESS)
															
 
																+			STARPU_CUBLAS_REPORT_ERROR(status);
															
 
																 #endif
															
 
																 	}
															
@@ -122,6 +129,10 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, STARPU_A
 
																 	unsigned nx21 = STARPU_MATRIX_GET_NY(descr[1]);
															
 
																 	unsigned ny21 = STARPU_MATRIX_GET_NX(descr[1]);
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+	cublasStatus status;
															
 
																+#endif
															
 
																+
															
 
																 	switch (s)
															
 
																 	{
															
 
																 		case 0:
															
@@ -129,12 +140,11 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, STARPU_A
 
																 			break;
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 		case 1:
															
 
																-#ifdef STARPU_HAVE_MAGMA
															
 
																-			cublasSetKernelStream(starpu_cuda_get_local_stream());
															
 
																-#else
															
 
																-			starpu_cublas_set_stream();
															
 
																-#endif
															
 
																-			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
															
 
																+			status = cublasStrsm(starpu_cublas_get_local_handle(),
															
 
																+					CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT,
															
 
																+					nx21, ny21, &p1, sub11, ld11, sub21, ld21);
															
 
																+			if (status != CUBLAS_STATUS_SUCCESS)
															
 
																+				STARPU_CUBLAS_REPORT_ERROR(status);
															
 
																 			break;
															
 
																 #endif
															
 
																 		default:
															
@@ -206,9 +216,10 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 
																 			{
															
 
																 			int ret;
															
 
																 			int info;
															
 
																+			cudaStream_t stream = starpu_cuda_get_local_stream();
															
 
																 #if (MAGMA_VERSION_MAJOR > 1) || (MAGMA_VERSION_MAJOR == 1 && MAGMA_VERSION_MINOR >= 4)
															
 
																-			cublasSetKernelStream(starpu_cuda_get_local_stream());
															
 
																-			magmablasSetKernelStream(starpu_cuda_get_local_stream());
															
 
																+			cublasSetKernelStream(stream);
															
 
																+			magmablasSetKernelStream(stream);
															
 
																 #else
															
 
																 			starpu_cublas_set_stream();
															
 
																 #endif
															
@@ -219,7 +230,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 
																 				STARPU_ABORT();
															
 
																 			}
															
 
																 #if (MAGMA_VERSION_MAJOR > 1) || (MAGMA_VERSION_MAJOR == 1 && MAGMA_VERSION_MINOR >= 4)
															
 
																-			cudaError_t cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																+			cudaError_t cures = cudaStreamSynchronize(stream);
															
 
																 #else
															
 
																 			cudaError_t cures = cudaThreadSynchronize();
															
 
																 #endif
															
@@ -229,29 +240,36 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 
																 			{
															
 
																 			float *lambda11;
															
 
																+			cublasStatus_t status;
															
 
																+			cudaStream_t stream = starpu_cuda_get_local_stream();
															
 
																+			cublasHandle_t handle = starpu_cublas_get_local_handle();
															
 
																 			cudaHostAlloc((void **)&lambda11, sizeof(float), 0);
															
 
																 			for (z = 0; z < nx; z++)
															
 
																 			{
															
 
																-				cudaMemcpyAsync(lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
															
 
																-				cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																+				cudaMemcpyAsync(lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, stream);
															
 
																+				cudaStreamSynchronize(stream);
															
 
																 				STARPU_ASSERT(*lambda11 != 0.0f);
															
 
																 				*lambda11 = sqrt(*lambda11);
															
 
																 /*				cublasSetVector(1, sizeof(float), lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float)); */
															
 
																-				cudaMemcpyAsync(&sub11[z+z*ld], lambda11, sizeof(float), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
															
 
																+				cudaMemcpyAsync(&sub11[z+z*ld], lambda11, sizeof(float), cudaMemcpyHostToDevice, stream);
															
 
																+				float scal = 1.0f/(*lambda11);
															
 
																-				cublasSscal(nx - z - 1, 1.0f/(*lambda11), &sub11[(z+1)+z*ld], 1);
															
 
																+				status = cublasSscal(handle,
															
 
																+						nx - z - 1, &scal, &sub11[(z+1)+z*ld], 1);
															
 
																-				cublasSsyr('U', nx - z - 1, -1.0f,
															
 
																+				status = cublasSsyr(handle,
															
 
																+							CUBLAS_FILL_MODE_UPPER,
															
 
																+							nx - z - 1, &m1,
															
 
																 							&sub11[(z+1)+z*ld], 1,
															
 
																 							&sub11[(z+1)+(z+1)*ld], ld);
															
 
																 			}
															
 
																-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																+			cudaStreamSynchronize(stream);
															
 
																 			cudaFreeHost(lambda11);
															
 
																 			}
															
 
																 #endif
															
--- a/examples/heat/dw_factolu.c
+++ b/examples/heat/dw_factolu.c
@@ -72,8 +72,8 @@ static struct starpu_codelet cl12 =
 
																 	.cpu_funcs_name = {"dw_cpu_codelet_update_u12"},
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	.cuda_funcs = {dw_cublas_codelet_update_u12},
															
 
																-	.cuda_flags = {STARPU_CUDA_ASYNC},
															
 
																 #endif
															
 
																+	.cuda_flags = {STARPU_CUDA_ASYNC},
															
 
																 	.nbuffers = 2,
															
 
																 	.modes = {STARPU_R, STARPU_RW},
															
 
																 	.model = &model_12
															
@@ -85,8 +85,8 @@ static struct starpu_codelet cl21 =
 
																 	.cpu_funcs_name = {"dw_cpu_codelet_update_u21"},
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	.cuda_funcs = {dw_cublas_codelet_update_u21},
															
 
																-	.cuda_flags = {STARPU_CUDA_ASYNC},
															
 
																 #endif
															
 
																+	.cuda_flags = {STARPU_CUDA_ASYNC},
															
 
																 	.nbuffers = 2,
															
 
																 	.modes = {STARPU_R, STARPU_RW},
															
 
																 	.model = &model_21
															
@@ -98,8 +98,8 @@ static struct starpu_codelet cl22 =
 
																 	.cpu_funcs_name = {"dw_cpu_codelet_update_u22"},
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	.cuda_funcs = {dw_cublas_codelet_update_u22},
															
 
																-	.cuda_flags = {STARPU_CUDA_ASYNC},
															
 
																 #endif
															
 
																+	.cuda_flags = {STARPU_CUDA_ASYNC},
															
 
																 	.nbuffers = 3,
															
 
																 	.modes = {STARPU_R, STARPU_R, STARPU_RW},
															
 
																 	.model = &model_22
															
--- a/examples/heat/dw_factolu.h
+++ b/examples/heat/dw_factolu.h
@@ -25,7 +25,6 @@
 
																 #ifdef STARPU_USE_CUDA
															
 
																 #include <cuda.h>
															
 
																 #include <cuda_runtime.h>
															
 
																-#include <cublas.h>
															
 
																 #endif
															
 
																 #include "../common/blas.h"
															
--- a/examples/heat/dw_factolu_grain.c
+++ b/examples/heat/dw_factolu_grain.c
@@ -99,6 +99,7 @@ static struct starpu_codelet cl12 =
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	.cuda_funcs = {dw_cublas_codelet_update_u12},
															
 
																 #endif
															
 
																+	.cuda_flags = {STARPU_CUDA_ASYNC},
															
 
																 	.nbuffers = 2,
															
 
																 	.model = &model_12
															
 
																 };
															
@@ -144,6 +145,7 @@ static struct starpu_codelet cl21 =
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	.cuda_funcs = {dw_cublas_codelet_update_u21},
															
 
																 #endif
															
 
																+	.cuda_flags = {STARPU_CUDA_ASYNC},
															
 
																 	.nbuffers = 2,
															
 
																 	.model = &model_21
															
 
																 };
															
@@ -186,6 +188,7 @@ static struct starpu_codelet cl22 =
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	.cuda_funcs = {dw_cublas_codelet_update_u22},
															
 
																 #endif
															
 
																+	.cuda_flags = {STARPU_CUDA_ASYNC},
															
 
																 	.nbuffers = 3,
															
 
																 	.model = &model_22
															
 
																 };
															
--- a/examples/heat/dw_factolu_kernels.c
+++ b/examples/heat/dw_factolu_kernels.c
@@ -20,6 +20,13 @@
 
																  */
															
 
																 #include "dw_factolu.h"
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+#include <cublas.h>
															
 
																+#include <starpu_cublas_v2.h>
															
 
																+static const float p1 =  1.0;
															
 
																+static const float m1 = -1.0;
															
 
																+#endif
															
 
																+
															
 
																 unsigned count_11_per_worker[STARPU_NMAXWORKERS] = {0};
															
 
																 unsigned count_12_per_worker[STARPU_NMAXWORKERS] = {0};
															
 
																 unsigned count_21_per_worker[STARPU_NMAXWORKERS] = {0};
															
@@ -134,10 +141,10 @@ static inline void dw_common_cpu_codelet_update_u22(void *descr[], int s, STARPU
 
																 #ifdef STARPU_USE_CUDA
															
 
																 		case 1:
															
 
																-			starpu_cublas_set_stream();
															
 
																-			cublasSgemm('n', 'n', dx, dy, dz, -1.0f, left, ld21,
															
 
																-					right, ld12, 1.0f, center, ld22);
															
 
																-			status = cublasGetError();
															
 
																+			status = cublasSgemm(starpu_cublas_get_local_handle(),
															
 
																+					CUBLAS_OP_N, CUBLAS_OP_N,
															
 
																+					dx, dy, dz, &m1, left, ld21,
															
 
																+					right, ld12, &p1, center, ld22);
															
 
																 			if (status != CUBLAS_STATUS_SUCCESS)
															
 
																 				STARPU_CUBLAS_REPORT_ERROR(status);
															
@@ -198,10 +205,10 @@ static inline void dw_common_codelet_update_u12(void *descr[], int s, STARPU_ATT
 
																 			break;
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 		case 1:
															
 
																-			starpu_cublas_set_stream();
															
 
																-			cublasStrsm('L', 'L', 'N', 'N', ny12, nx12,
															
 
																-					1.0f, sub11, ld11, sub12, ld12);
															
 
																-			status = cublasGetError();
															
 
																+			status = cublasStrsm(starpu_cublas_get_local_handle(),
															
 
																+					CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
															
 
																+					ny12, nx12,
															
 
																+					&p1, sub11, ld11, sub12, ld12);
															
 
																 			if (status != CUBLAS_STATUS_SUCCESS)
															
 
																 				STARPU_CUBLAS_REPORT_ERROR(status);
															
@@ -260,9 +267,9 @@ static inline void dw_common_codelet_update_u21(void *descr[], int s, STARPU_ATT
 
																 			break;
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 		case 1:
															
 
																-			starpu_cublas_set_stream();
															
 
																-			cublasStrsm('R', 'U', 'N', 'U', ny21, nx21, 1.0f, sub11, ld11, sub21, ld21);
															
 
																-			status = cublasGetError();
															
 
																+			status = cublasStrsm(starpu_cublas_get_local_handle(),
															
 
																+					CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT,
															
 
																+					ny21, nx21, &p1, sub11, ld11, sub21, ld21);
															
 
																 			if (status != CUBLAS_STATUS_SUCCESS)
															
 
																 				STARPU_CUBLAS_REPORT_ERROR(status);
															
@@ -322,6 +329,12 @@ static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATT
 
																 	unsigned long z;
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+	cudaStream_t stream;
															
 
																+	cublasHandle_t handle;
															
 
																+	cublasStatus_t status;
															
 
																+#endif
															
 
																+
															
 
																 	switch (s)
															
 
																 	{
															
 
																 		case 0:
															
@@ -341,24 +354,28 @@ static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATT
 
																 			break;
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 		case 1:
															
 
																-			starpu_cublas_set_stream();
															
 
																+			stream = starpu_cuda_get_local_stream();
															
 
																+			handle = starpu_cublas_get_local_handle();
															
 
																 			for (z = 0; z < nx; z++)
															
 
																 			{
															
 
																 				float pivot;
															
 
																-				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
															
 
																-				cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																+				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, stream);
															
 
																+				cudaStreamSynchronize(stream);
															
 
																 				STARPU_ASSERT(pivot != 0.0f);
															
 
																+				float scal = 1.0f/pivot;
															
 
																-				cublasSscal(nx - z - 1, 1.0f/pivot, &sub11[z+(z+1)*ld], ld);
															
 
																+				status = cublasSscal(starpu_cublas_get_local_handle(),
															
 
																+						nx - z - 1, &scal, &sub11[z+(z+1)*ld], ld);
															
 
																-				cublasSger(nx - z - 1, nx - z - 1, -1.0f,
															
 
																+				status = cublasSger(starpu_cublas_get_local_handle(),
															
 
																+						nx - z - 1, nx - z - 1, &m1,
															
 
																 								&sub11[z+(z+1)*ld], ld,
															
 
																 								&sub11[(z+1)+z*ld], 1,
															
 
																 								&sub11[(z+1) + (z+1)*ld],ld);
															
 
																 			}
															
 
																-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																+			cudaStreamSynchronize(stream);
															
 
																 			break;
															
 
																 #endif
															
--- a/examples/heat/dw_sparse_cg.c
+++ b/examples/heat/dw_sparse_cg.c
@@ -241,6 +241,7 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
																 	struct starpu_task *task6 = create_task(maskiter | 6UL);
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	task6->cl->cuda_funcs[0] = cublas_codelet_func_6;
															
 
																+	task6->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
															
 
																 #endif
															
 
																 	task6->cl->cpu_funcs[0] = cpu_codelet_func_6;
															
 
																 	task6->cl->cpu_funcs_name[0] = "cpu_codelet_func_6";
															
@@ -259,6 +260,7 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
																 	struct starpu_task *task7 = create_task(maskiter | 7UL);
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	task7->cl->cuda_funcs[0] = cublas_codelet_func_7;
															
 
																+	task7->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
															
 
																 #endif
															
 
																 	task7->cl->cpu_funcs[0] = cpu_codelet_func_7;
															
 
																 	task7->cl->cpu_funcs_name[0] = "cpu_codelet_func_7";
															
@@ -292,6 +294,7 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
																 	struct starpu_task *task9 = create_task(maskiter | 9UL);
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	task9->cl->cuda_funcs[0] = cublas_codelet_func_9;
															
 
																+	task9->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
															
 
																 #endif
															
 
																 	task9->cl->cpu_funcs[0] = cpu_codelet_func_9;
															
 
																 	task9->cl->cpu_funcs_name[0] = "cpu_codelet_func_9";
															
--- a/examples/heat/dw_sparse_cg.h
+++ b/examples/heat/dw_sparse_cg.h
@@ -29,10 +29,6 @@
 
																 #include <starpu.h>
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-#include <cublas.h>
															
 
																-#endif
															
 
																-
															
 
																 #include "../common/blas.h"
															
 
																 #define MAXITER	100000
															
--- a/examples/heat/dw_sparse_cg_kernels.c
+++ b/examples/heat/dw_sparse_cg_kernels.c
@@ -17,6 +17,10 @@
 
																 #include "dw_sparse_cg.h"
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+#include <starpu_cublas_v2.h>
															
 
																+#endif
															
 
																+
															
 
																 /*
															
 
																  *	Algorithm :
															
 
																  *		
															
@@ -146,8 +150,10 @@ void cublas_codelet_func_3(void *descr[], void *arg)
 
																 	vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																 	size = STARPU_VECTOR_GET_NX(descr[0]);
															
 
																-	starpu_cublas_set_stream();
															
 
																-	dot = cublasSdot (size, vec, 1, vec, 1);
															
 
																+	cublasStatus_t status = cublasSdot (starpu_cublas_get_local_handle(), size, vec, 1, vec, 1, &dot);
															
 
																+	if (status != CUBLAS_STATUS_SUCCESS)
															
 
																+		STARPU_CUBLAS_REPORT_ERROR(status);
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 	pb->delta_new = dot;
															
 
																 	pb->delta_0 = dot;
															
@@ -239,8 +245,10 @@ void cublas_codelet_func_5(void *descr[], void *arg)
 
																 	STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
															
 
																 	size = STARPU_VECTOR_GET_NX(descr[0]);
															
 
																-	starpu_cublas_set_stream();
															
 
																-	dot = cublasSdot (size, vecd, 1, vecq, 1);
															
 
																+	cublasStatus_t status = cublasSdot (starpu_cublas_get_local_handle(), size, vecd, 1, vecq, 1, &dot);
															
 
																+	if (status != CUBLAS_STATUS_SUCCESS)
															
 
																+		STARPU_CUBLAS_REPORT_ERROR(status);
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 	pb->alpha = pb->delta_new / dot;
															
 
																 }
															
@@ -283,8 +291,9 @@ void cublas_codelet_func_6(void *descr[], void *arg)
 
																 	size = STARPU_VECTOR_GET_NX(descr[0]);
															
 
																-	starpu_cublas_set_stream();
															
 
																-	cublasSaxpy (size, pb->alpha, vecd, 1, vecx, 1);
															
 
																+	cublasStatus_t status = cublasSaxpy (starpu_cublas_get_local_handle(), size, &pb->alpha, vecd, 1, vecx, 1);
															
 
																+	if (status != CUBLAS_STATUS_SUCCESS)
															
 
																+		STARPU_CUBLAS_REPORT_ERROR(status);
															
 
																 }
															
 
																 #endif
															
@@ -323,8 +332,11 @@ void cublas_codelet_func_7(void *descr[], void *arg)
 
																 	size = STARPU_VECTOR_GET_NX(descr[0]);
															
 
																-	starpu_cublas_set_stream();
															
 
																-	cublasSaxpy (size, -pb->alpha, vecq, 1, vecr, 1);
															
 
																+	float scal = -pb->alpha;
															
 
																+
															
 
																+	cublasStatus_t status = cublasSaxpy (starpu_cublas_get_local_handle(), size, &scal, vecq, 1, vecr, 1);
															
 
																+	if (status != CUBLAS_STATUS_SUCCESS)
															
 
																+		STARPU_CUBLAS_REPORT_ERROR(status);
															
 
																 }
															
 
																 #endif
															
@@ -367,8 +379,8 @@ void cublas_codelet_func_8(void *descr[], void *arg)
 
																 	vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																 	size = STARPU_VECTOR_GET_NX(descr[0]);
															
 
																-	starpu_cublas_set_stream();
															
 
																-	dot = cublasSdot (size, vecr, 1, vecr, 1);
															
 
																+	cublasStatus_t status = cublasSdot (starpu_cublas_get_local_handle(), size, vecr, 1, vecr, 1, &dot);
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 	pb->delta_old = pb->delta_new;
															
 
																 	pb->delta_new = dot;
															
@@ -416,11 +428,16 @@ void cublas_codelet_func_9(void *descr[], void *arg)
 
																 	size = STARPU_VECTOR_GET_NX(descr[0]);
															
 
																-	starpu_cublas_set_stream();
															
 
																 	/* d = beta d */
															
 
																-	cublasSscal(size, pb->beta, vecd, 1);
															
 
																+	cublasStatus_t status;
															
 
																+	status = cublasSscal(starpu_cublas_get_local_handle(), size, &pb->beta, vecd, 1);
															
 
																+	if (status != CUBLAS_STATUS_SUCCESS)
															
 
																+		STARPU_CUBLAS_REPORT_ERROR(status);
															
 
																 	/* d = r + d */
															
 
																-	cublasSaxpy (size, 1.0f, vecr, 1, vecd, 1);
															
 
																+	float scal = 1.0f;
															
 
																+	status = cublasSaxpy (starpu_cublas_get_local_handle(), size, &scal, vecr, 1, vecd, 1);
															
 
																+	if (status != CUBLAS_STATUS_SUCCESS)
															
 
																+		STARPU_CUBLAS_REPORT_ERROR(status);
															
 
																 }
															
 
																 #endif
															
--- a/examples/lu/xlu.h
+++ b/examples/lu/xlu.h
@@ -20,9 +20,6 @@
 
																 #include <starpu.h>
															
 
																 #include <common/blas.h>
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-#include <cublas.h>
															
 
																-#endif
															
 
																 #define TAG11(k)	((starpu_tag_t)( (1ULL<<60) | (unsigned long long)(k)))
															
 
																 #define TAG12(k,i)	((starpu_tag_t)(((2ULL<<60) | (((unsigned long long)(k))<<32)	\
															
--- a/examples/lu/xlu_kernels.c
+++ b/examples/lu/xlu_kernels.c
@@ -21,6 +21,11 @@
 
																 #include <math.h>
															
 
																 #include <complex.h>
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+#include <cublas.h>
															
 
																+#include <starpu_cublas_v2.h>
															
 
																+#endif
															
 
																+
															
 
																 #define str(s) #s
															
 
																 #define xstr(s)        str(s)
															
 
																 #define STARPU_LU_STR(name)  xstr(STARPU_LU(name))
															
@@ -65,12 +70,11 @@ static inline void STARPU_LU(common_u22)(void *descr[],
 
																 #ifdef STARPU_USE_CUDA
															
 
																 		case 1:
															
 
																 		{
															
 
																-			starpu_cublas_set_stream();
															
 
																-			CUBLAS_GEMM('n', 'n', dx, dy, dz,
															
 
																-				*(CUBLAS_TYPE*)&m1, (CUBLAS_TYPE *)right, ld21, (CUBLAS_TYPE *)left, ld12,
															
 
																-				*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE *)center, ld22);
															
 
																+			status = CUBLAS_GEMM(starpu_cublas_get_local_handle(),
															
 
																+				CUBLAS_OP_N, CUBLAS_OP_N, dx, dy, dz,
															
 
																+				(CUBLAS_TYPE *)&m1, (CUBLAS_TYPE *)right, ld21, (CUBLAS_TYPE *)left, ld12,
															
 
																+				(CUBLAS_TYPE *)&p1, (CUBLAS_TYPE *)center, ld22);
															
 
																-			status = cublasGetError();
															
 
																 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
															
 
																 				STARPU_CUBLAS_REPORT_ERROR(status);
															
@@ -186,11 +190,11 @@ static inline void STARPU_LU(common_u12)(void *descr[],
 
																 			break;
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 		case 1:
															
 
																-			starpu_cublas_set_stream();
															
 
																-			CUBLAS_TRSM('L', 'L', 'N', 'N', ny12, nx12,
															
 
																-					*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub12, ld12);
															
 
																+			status = CUBLAS_TRSM(starpu_cublas_get_local_handle(),
															
 
																+					CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
															
 
																+					ny12, nx12,
															
 
																+					(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub12, ld12);
															
 
																-			status = cublasGetError();
															
 
																 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
															
 
																 				STARPU_CUBLAS_REPORT_ERROR(status);
															
@@ -273,11 +277,11 @@ static inline void STARPU_LU(common_u21)(void *descr[],
 
																 			break;
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 		case 1:
															
 
																-			starpu_cublas_set_stream();
															
 
																-			CUBLAS_TRSM('R', 'U', 'N', 'U', ny21, nx21,
															
 
																-					*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub21, ld21);
															
 
																+			status = CUBLAS_TRSM(starpu_cublas_get_local_handle(),
															
 
																+					CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT,
															
 
																+					ny21, nx21,
															
 
																+					(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub21, ld21);
															
 
																-			status = cublasGetError();
															
 
																 			if (status != CUBLAS_STATUS_SUCCESS)
															
 
																 				STARPU_CUBLAS_REPORT_ERROR(status);
															
@@ -345,6 +349,12 @@ static inline void STARPU_LU(common_u11)(void *descr[],
 
																 	unsigned long z;
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+	cublasStatus status;
															
 
																+	cublasHandle_t handle;
															
 
																+	cudaStream_t stream;
															
 
																+#endif
															
 
																+
															
 
																 	switch (s)
															
 
																 	{
															
 
																 		case 0:
															
@@ -369,13 +379,14 @@ static inline void STARPU_LU(common_u11)(void *descr[],
 
																 			break;
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 		case 1:
															
 
																-			starpu_cublas_set_stream();
															
 
																+			handle = starpu_cublas_get_local_handle();
															
 
																+			stream = starpu_cuda_get_local_stream();
															
 
																 			for (z = 0; z < nx; z++)
															
 
																 			{
															
 
																 				TYPE pivot;
															
 
																 				TYPE inv_pivot;
															
 
																-				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
															
 
																-				cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																+				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, stream);
															
 
																+				cudaStreamSynchronize(stream);
															
 
																 #ifdef COMPLEX_LU
															
 
																 				STARPU_ASSERT(fpclassify(creal(pivot)) != FP_ZERO);
															
@@ -385,15 +396,23 @@ static inline void STARPU_LU(common_u11)(void *descr[],
 
																 #endif
															
 
																 				inv_pivot = 1.0/pivot;
															
 
																-				CUBLAS_SCAL(nx - z - 1, *(CUBLAS_TYPE*)&inv_pivot, (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
															
 
																+				status = CUBLAS_SCAL(handle,
															
 
																+						nx - z - 1,
															
 
																+						(CUBLAS_TYPE*)&inv_pivot, (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
															
 
																+				if (status != CUBLAS_STATUS_SUCCESS)
															
 
																+					STARPU_CUBLAS_REPORT_ERROR(status);
															
 
																-				CUBLAS_GER(nx - z - 1, nx - z - 1, *(CUBLAS_TYPE*)&m1,
															
 
																+				status = CUBLAS_GER(handle,
															
 
																+						nx - z - 1, nx - z - 1,
															
 
																+						(CUBLAS_TYPE*)&m1,
															
 
																 						(CUBLAS_TYPE*)&sub11[(z+1)+z*ld], 1,
															
 
																 						(CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld,
															
 
																 						(CUBLAS_TYPE*)&sub11[(z+1) + (z+1)*ld],ld);
															
 
																+				if (status != CUBLAS_STATUS_SUCCESS)
															
 
																+					STARPU_CUBLAS_REPORT_ERROR(status);
															
 
																 			}
															
 
																-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																+			cudaStreamSynchronize(stream);
															
 
																 			break;
															
 
																 #endif
															
@@ -462,6 +481,12 @@ static inline void STARPU_LU(common_u11_pivot)(void *descr[],
 
																 	unsigned *ipiv = piv->piv;
															
 
																 	unsigned first = piv->first;
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+	cublasStatus status;
															
 
																+	cublasHandle_t handle;
															
 
																+	cudaStream_t stream;
															
 
																+#endif
															
 
																+
															
 
																 	switch (s)
															
 
																 	{
															
 
																 		case 0:
															
@@ -500,44 +525,63 @@ static inline void STARPU_LU(common_u11_pivot)(void *descr[],
 
																 			break;
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 		case 1:
															
 
																-			starpu_cublas_set_stream();
															
 
																+			handle = starpu_cublas_get_local_handle();
															
 
																+			stream = starpu_cuda_get_local_stream();
															
 
																 			for (z = 0; z < nx; z++)
															
 
																 			{
															
 
																 				TYPE pivot;
															
 
																 				TYPE inv_pivot;
															
 
																-				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
															
 
																-				cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																+				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, stream);
															
 
																+				cudaStreamSynchronize(stream);
															
 
																 				if (fabs((double)(pivot)) < PIVOT_THRESHHOLD)
															
 
																 				{
															
 
																 					/* find the pivot */
															
 
																-					int piv_ind = CUBLAS_IAMAX(nx - z, (CUBLAS_TYPE*)&sub11[z*(ld+1)], ld) - 1;
															
 
																+					int piv_ind;
															
 
																+					status = CUBLAS_IAMAX(handle,
															
 
																+						nx - z, (CUBLAS_TYPE*)&sub11[z*(ld+1)], ld, &piv_ind);
															
 
																+					piv_ind -= 1;
															
 
																+					if (status != CUBLAS_STATUS_SUCCESS)
															
 
																+						STARPU_CUBLAS_REPORT_ERROR(status);
															
 
																 					ipiv[z + first] = piv_ind + z + first;
															
 
																 					/* swap if needed */
															
 
																 					if (piv_ind != 0)
															
 
																 					{
															
 
																-						CUBLAS_SWAP(nx, (CUBLAS_TYPE*)&sub11[z*ld], 1, (CUBLAS_TYPE*)&sub11[(z+piv_ind)*ld], 1);
															
 
																+						status = CUBLAS_SWAP(handle,
															
 
																+							nx,
															
 
																+							(CUBLAS_TYPE*)&sub11[z*ld], 1,
															
 
																+							(CUBLAS_TYPE*)&sub11[(z+piv_ind)*ld], 1);
															
 
																+						if (status != CUBLAS_STATUS_SUCCESS)
															
 
																+							STARPU_CUBLAS_REPORT_ERROR(status);
															
 
																 					}
															
 
																-					cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
															
 
																-					cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																+					cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, stream);
															
 
																+					cudaStreamSynchronize(stream);
															
 
																 				}
															
 
																 				STARPU_ASSERT(pivot != 0.0);
															
 
																 				inv_pivot = 1.0/pivot;
															
 
																-				CUBLAS_SCAL(nx - z - 1, *(CUBLAS_TYPE*)&inv_pivot, (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
															
 
																+				status = CUBLAS_SCAL(handle,
															
 
																+						nx - z - 1,
															
 
																+						(CUBLAS_TYPE*)&inv_pivot,
															
 
																+						(CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
															
 
																+				if (status != CUBLAS_STATUS_SUCCESS)
															
 
																+					STARPU_CUBLAS_REPORT_ERROR(status);
															
 
																-				CUBLAS_GER(nx - z - 1, nx - z - 1, *(CUBLAS_TYPE*)&m1,
															
 
																+				status = CUBLAS_GER(handle,
															
 
																+						nx - z - 1, nx - z - 1,
															
 
																+						(CUBLAS_TYPE*)&m1,
															
 
																 						(CUBLAS_TYPE*)&sub11[(z+1)+z*ld], 1,
															
 
																 						(CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld,
															
 
																 						(CUBLAS_TYPE*)&sub11[(z+1) + (z+1)*ld],ld);
															
 
																-				
															
 
																+				if (status != CUBLAS_STATUS_SUCCESS)
															
 
																+						STARPU_CUBLAS_REPORT_ERROR(status);
															
 
																 			}
															
 
																-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																+			cudaStreamSynchronize(stream);
															
 
																 			break;
															
 
																 #endif
															
@@ -605,6 +649,11 @@ static inline void STARPU_LU(common_pivot)(void *descr[],
 
																 	unsigned *ipiv = piv->piv;
															
 
																 	unsigned first = piv->first;
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+	cublasStatus status;
															
 
																+	cublasHandle_t handle;
															
 
																+#endif
															
 
																+
															
 
																 	switch (s)
															
 
																 	{
															
 
																 		case 0:
															
@@ -619,13 +668,18 @@ static inline void STARPU_LU(common_pivot)(void *descr[],
 
																 			break;
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 		case 1:
															
 
																-			starpu_cublas_set_stream();
															
 
																+			handle = starpu_cublas_get_local_handle();
															
 
																 			for (row = 0; row < nx; row++)
															
 
																 			{
															
 
																 				unsigned rowpiv = ipiv[row+first] - first;
															
 
																 				if (rowpiv != row)
															
 
																 				{
															
 
																-					CUBLAS_SWAP(nx, (CUBLAS_TYPE*)&matrix[row*ld], 1, (CUBLAS_TYPE*)&matrix[rowpiv*ld], 1);
															
 
																+					status = CUBLAS_SWAP(handle,
															
 
																+							nx,
															
 
																+							(CUBLAS_TYPE*)&matrix[row*ld], 1,
															
 
																+							(CUBLAS_TYPE*)&matrix[rowpiv*ld], 1);
															
 
																+					if (status != CUBLAS_STATUS_SUCCESS)
															
 
																+						STARPU_CUBLAS_REPORT_ERROR(status);
															
 
																 				}
															
 
																 			}
															
--- a/examples/mult/xgemm.c
+++ b/examples/mult/xgemm.c
@@ -36,7 +36,10 @@
 
																 #ifdef STARPU_USE_CUDA
															
 
																 #include <cuda.h>
															
 
																-#include <cublas.h>
															
 
																+#include <starpu_cublas_v2.h>
															
 
																+static const TYPE p1 = 1.0;
															
 
																+static const TYPE m1 = -1.0;
															
 
																+static const TYPE v0 = 0.0;
															
 
																 #endif
															
 
																 static unsigned niter = 10;
															
@@ -161,9 +164,13 @@ static void cublas_mult(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
 
																 	unsigned ldB = STARPU_MATRIX_GET_LD(descr[1]);
															
 
																 	unsigned ldC = STARPU_MATRIX_GET_LD(descr[2]);
															
 
																-	starpu_cublas_set_stream();
															
 
																-	CUBLAS_GEMM('n', 'n', nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
															
 
																-				     (TYPE)0.0, subC, ldC);
															
 
																+	cublasStatus_t status = CUBLAS_GEMM(starpu_cublas_get_local_handle(),
															
 
																+			CUBLAS_OP_N, CUBLAS_OP_N,
															
 
																+			nxC, nyC, nyA,
															
 
																+			&p1, subA, ldA, subB, ldB,
															
 
																+			&v0, subC, ldC);
															
 
																+	if (status != CUBLAS_STATUS_SUCCESS)
															
 
																+		STARPU_CUBLAS_REPORT_ERROR(status);
															
 
																 }
															
 
																 #endif
															
--- a/examples/pipeline/pipeline.c
+++ b/examples/pipeline/pipeline.c
@@ -35,7 +35,7 @@
 
																 #include <common/blas.h>
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-#include <cublas.h>
															
 
																+#include <starpu_cublas_v2.h>
															
 
																 #endif
															
 
																 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
															
@@ -100,10 +100,9 @@ void pipeline_cublas_axpy(void *descr[], void *arg)
 
																 	float *x = (float *) STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																 	float *y = (float *) STARPU_VECTOR_GET_PTR(descr[1]);
															
 
																 	int n = STARPU_VECTOR_GET_NX(descr[0]);
															
 
																+	float alpha = 1.;
															
 
																-	starpu_cublas_set_stream();
															
 
																-	cublasSaxpy(n, 1., x, 1, y, 1);
															
 
																-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																+	cublasSaxpy(starpu_cublas_get_local_handle(), n, &alpha, x, 1, y, 1);
															
 
																 }
															
 
																 #endif
															
@@ -119,6 +118,7 @@ static struct starpu_codelet pipeline_codelet_axpy =
 
																 	.cpu_funcs_name = {"pipeline_cpu_axpy"},
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	.cuda_funcs = {pipeline_cublas_axpy},
															
 
																+	.cuda_flags = {STARPU_CUDA_ASYNC},
															
 
																 #endif
															
 
																 	.nbuffers = 2,
															
 
																 	.modes = {STARPU_R, STARPU_RW},
															
@@ -144,10 +144,8 @@ void pipeline_cublas_sum(void *descr[], void *arg)
 
																 	int n = STARPU_VECTOR_GET_NX(descr[0]);
															
 
																 	float y;
															
 
																-	starpu_cublas_set_stream();
															
 
																-	y = cublasSasum(n, x, 1);
															
 
																+	cublasSasum(starpu_cublas_get_local_handle(), n, x, 1, &y);
															
 
																-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 	FPRINTF(stderr,"CUBLAS finished with %f\n", y);
															
 
																 }
															
 
																 #endif
															
@@ -164,6 +162,7 @@ static struct starpu_codelet pipeline_codelet_sum =
 
																 	.cpu_funcs_name = {"pipeline_cpu_sum"},
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	.cuda_funcs = {pipeline_cublas_sum},
															
 
																+	.cuda_flags = {STARPU_CUDA_ASYNC},
															
 
																 #endif
															
 
																 	.nbuffers = 1,
															
 
																 	.modes = {STARPU_R},
															
--- a/examples/reductions/dot_product.c
+++ b/examples/reductions/dot_product.c
@@ -29,7 +29,7 @@
 
																 #ifdef STARPU_USE_CUDA
															
 
																 #include <cuda.h>
															
 
																-#include <cublas.h>
															
 
																+#include <starpu_cublas_v2.h>
															
 
																 #endif
															
 
																 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
															
@@ -245,7 +245,7 @@ void dot_cpu_func(void *descr[], void *cl_arg)
 
																 void dot_cuda_func(void *descr[], void *cl_arg)
															
 
																 {
															
 
																 	DOT_TYPE current_dot;
															
 
																-	DOT_TYPE local_dot;
															
 
																+	float local_dot;
															
 
																 	float *local_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																 	float *local_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
															
@@ -256,9 +256,10 @@ void dot_cuda_func(void *descr[], void *cl_arg)
 
																 	cudaMemcpyAsync(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
															
 
																 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																-	if (cublas_version >= 7050)
															
 
																-		starpu_cublas_set_stream();
															
 
																-	local_dot = (DOT_TYPE)cublasSdot(n, local_x, 1, local_y, 1);
															
 
																+	cublasStatus_t status = cublasSdot(starpu_cublas_get_local_handle(), n, local_x, 1, local_y, 1, &local_dot);
															
 
																+	if (status != CUBLAS_STATUS_SUCCESS)
															
 
																+		STARPU_CUBLAS_REPORT_ERROR(status);
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 	/* FPRINTF(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot); */
															
 
																 	current_dot += local_dot;
															
@@ -358,14 +359,15 @@ int main(int argc, char **argv)
 
																 #endif
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-	/* cublasSdot has synchronization issues when using a non-blocking stream (Nvidia bugid 1669886) */
															
 
																-	cublasGetVersion(&cublas_version);
															
 
																+	cublasHandle_t handle;
															
 
																+	cublasCreate(&handle);
															
 
																+	cublasGetVersion(handle, &cublas_version);
															
 
																+	cublasDestroy(handle);
															
 
																 	if (cublas_version >= 7050)
															
 
																 		starpu_cublas_init();
															
 
																-	if (starpu_get_env_number_default("STARPU_NWORKER_PER_CUDA", 1) > 1
															
 
																-	 && starpu_get_env_number_default("STARPU_CUDA_THREAD_PER_WORKER", 0) == 1)
															
 
																-		/* Disable the sdot cublas kernel, it is bogus with concurrent
															
 
																-		 * multistream execution (Nvidia bugid 1881192) */
															
 
																+	else
															
 
																+		/* Disable the sdot cublas kernel, it is bogus with a
															
 
																+		 * non-blocking stream (Nvidia bugid 1669886) */
															
 
																 		dot_codelet.cuda_funcs[0] = NULL;
															
 
																 #endif
															
--- a/examples/sched_ctx/gpu_partition.c
+++ b/examples/sched_ctx/gpu_partition.c
@@ -26,10 +26,6 @@
 
																 #include <common/blas.h>
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-#include <cublas.h>
															
 
																-#endif
															
 
																-
															
 
																 #define N	512*512
															
 
																 #define NITER   100
															
--- a/examples/spmv/dw_block_spmv.c
+++ b/examples/spmv/dw_block_spmv.c
@@ -167,6 +167,7 @@ struct starpu_codelet cl =
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	.cuda_funcs = {cublas_block_spmv},
															
 
																 #endif
															
 
																+	.cuda_flags = {STARPU_CUDA_ASYNC},
															
 
																 	.nbuffers = 3,
															
 
																 	.modes = {STARPU_R, STARPU_R, STARPU_RW}
															
 
																 };
															
@@ -320,6 +321,7 @@ int main(STARPU_ATTRIBUTE_UNUSED int argc,
 
																 	if (ret == -ENODEV)
															
 
																 		return 77;
															
 
																 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
															
 
																+	starpu_cublas_init();
															
 
																 	sem_init(&sem, 0, 0U);
															
--- a/examples/spmv/dw_block_spmv.h
+++ b/examples/spmv/dw_block_spmv.h
@@ -28,10 +28,6 @@
 
																 #include <starpu.h>
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-#include <cublas.h>
															
 
																-#endif
															
 
																-
															
 
																 void cpu_block_spmv(void *descr[], void *_args);
															
 
																 #ifdef STARPU_USE_CUDA
															
--- a/examples/spmv/dw_block_spmv_kernels.c
+++ b/examples/spmv/dw_block_spmv_kernels.c
@@ -24,6 +24,12 @@
 
																  *   U22 
															
 
																  */
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+#include <starpu_cublas_v2.h>
															
 
																+static const float p1 =  1.0;
															
 
																+static const float m1 = -1.0;
															
 
																+#endif
															
 
																+
															
 
																 static inline void common_block_spmv(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
															
 
																 {
															
 
																 	/* printf("22\n"); */
															
@@ -43,8 +49,10 @@ static inline void common_block_spmv(void *descr[], int s, STARPU_ATTRIBUTE_UNUS
 
																 			break;
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 		case 1:
															
 
																-			starpu_cublas_set_stream();
															
 
																-			cublasSgemv ('t', dx, dy, 1.0f, block, ld, in, 1, 1.0f, out, 1);
															
 
																+			cublasStatus_t status = cublasSgemv (starpu_cublas_get_local_handle(),
															
 
																+					CUBLAS_OP_T, dx, dy, &p1, block, ld, in, 1, &p1, out, 1);
															
 
																+			if (status != CUBLAS_STATUS_SUCCESS)
															
 
																+				STARPU_CUBLAS_REPORT_ERROR(status);
															
 
																 			break;
															
 
																 #endif
															
 
																 		default:
															
--- a/include/starpu_cublas_v2.h
+++ b/include/starpu_cublas_v2.h
@@ -0,0 +1,34 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010-2012  Université de Bordeaux
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013  CNRS
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#ifndef __STARPU_CUBLAS_V2_H__
															
 
																+#define __STARPU_CUBLAS_V2_H__
															
 
																+
															
 
																+#include <cublas_v2.h>
															
 
																+
															
 
																+#ifdef __cplusplus
															
 
																+extern "C"
															
 
																+{
															
 
																+#endif
															
 
																+
															
 
																+cublasHandle_t starpu_cublas_get_local_handle(void);
															
 
																+
															
 
																+#ifdef __cplusplus
															
 
																+}
															
 
																+#endif
															
 
																+
															
 
																+#endif /* __STARPU_CUBLAS_V2_H__ */
															
--- a/src/datawizard/copy_driver.h
+++ b/src/datawizard/copy_driver.h
@@ -29,7 +29,6 @@
 
																 #ifdef STARPU_USE_CUDA
															
 
																 #include <cuda.h>
															
 
																 #include <cuda_runtime.h>
															
 
																-#include <cublas.h>
															
 
																 #endif
															
 
																 #ifdef STARPU_USE_OPENCL
															
--- a/src/drivers/cuda/starpu_cublas.c
+++ b/src/drivers/cuda/starpu_cublas.c
@@ -22,8 +22,11 @@
 
																 #ifdef STARPU_USE_CUDA
															
 
																 #include <cublas.h>
															
 
																+#include <starpu_cublas_v2.h>
															
 
																 static int cublas_initialized[STARPU_NMAXWORKERS];
															
 
																+static cublasHandle_t cublas_handles[STARPU_NMAXWORKERS];
															
 
																+static cublasHandle_t main_handle;
															
 
																 static starpu_pthread_mutex_t mutex;
															
 
																 static unsigned get_idx(void) {
															
@@ -51,6 +54,9 @@ static void init_cublas_func(void *args STARPU_ATTRIBUTE_UNUSED)
 
																 			STARPU_CUBLAS_REPORT_ERROR(cublasst);
															
 
																 	}
															
 
																 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
															
 
																+
															
 
																+	cublasCreate(&cublas_handles[starpu_worker_get_id_check()]);
															
 
																+	cublasSetStream(cublas_handles[starpu_worker_get_id_check()], starpu_cuda_get_local_stream());
															
 
																 }
															
 
																 static void set_cublas_stream_func(void *args STARPU_ATTRIBUTE_UNUSED)
															
@@ -65,6 +71,8 @@ static void shutdown_cublas_func(void *args STARPU_ATTRIBUTE_UNUSED)
 
																 	if (!--cublas_initialized[idx])
															
 
																 		cublasShutdown();
															
 
																 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
															
 
																+
															
 
																+	cublasDestroy(cublas_handles[starpu_worker_get_id_check()]);
															
 
																 }
															
 
																 #endif
															
@@ -73,6 +81,8 @@ void starpu_cublas_init(void)
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	starpu_execute_on_each_worker(init_cublas_func, NULL, STARPU_CUDA);
															
 
																 	starpu_execute_on_each_worker(set_cublas_stream_func, NULL, STARPU_CUDA);
															
 
																+
															
 
																+	cublasCreate(&main_handle);
															
 
																 #endif
															
 
																 }
															
@@ -80,6 +90,8 @@ void starpu_cublas_shutdown(void)
 
																 {
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	starpu_execute_on_each_worker(shutdown_cublas_func, NULL, STARPU_CUDA);
															
 
																+
															
 
																+	cublasDestroy(main_handle);
															
 
																 #endif
															
 
																 }
															
@@ -92,3 +104,12 @@ void starpu_cublas_set_stream(void)
 
																 		cublasSetKernelStream(starpu_cuda_get_local_stream());
															
 
																 #endif
															
 
																 }
															
 
																+
															
 
																+cublasHandle_t starpu_cublas_get_local_handle(void)
															
 
																+{
															
 
																+	int workerid = starpu_worker_get_id();
															
 
																+	if (workerid >= 0)
															
 
																+		return cublas_handles[workerid];
															
 
																+	else
															
 
																+		return main_handle;
															
 
																+}
															
--- a/tests/microbenchs/matrix_as_vector.c
+++ b/tests/microbenchs/matrix_as_vector.c
@@ -18,7 +18,7 @@
 
																 #include "../helper.h"
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-#  include <cublas.h>
															
 
																+#  include <starpu_cublas_v2.h>
															
 
																 #endif
															
 
																 /*
															
@@ -55,8 +55,9 @@ void vector_cuda_func(void *descr[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
 
																 	float *matrix = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																 	int nx = STARPU_VECTOR_GET_NX(descr[0]);
															
 
																-	starpu_cublas_set_stream();
															
 
																-	float sum = cublasSasum(nx, matrix, 1);
															
 
																+	float sum;
															
 
																+	cublasSasum(starpu_cublas_get_local_handle(), nx, matrix, 1, &sum);
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 	sum /= nx;
															
 
																 	cudaMemcpyAsync(matrix, &sum, sizeof(matrix[0]), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
															
@@ -87,7 +88,9 @@ void matrix_cuda_func(void *descr[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
 
																 	int nx = STARPU_MATRIX_GET_NX(descr[0]);
															
 
																 	int ny = STARPU_MATRIX_GET_NY(descr[0]);
															
 
																-	float sum = cublasSasum(nx*ny, matrix, 1);
															
 
																+	float sum;
															
 
																+	cublasSasum(starpu_cublas_get_local_handle(), nx*ny, matrix, 1, &sum);
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 	sum /= nx*ny;
															
 
																 	cudaMemcpyAsync(matrix, &sum, sizeof(matrix[0]), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
															
@@ -218,12 +221,7 @@ int main(int argc, char **argv)
 
																 	ret = starpu_init(NULL);
															
 
																 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
															
 
																 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-	/* cublasSasum has synchronization issues when using a non-blocking stream */
															
 
																-	cublasGetVersion(&cublas_version);
															
 
																-	if (cublas_version >= 7050)
															
 
																-		starpu_cublas_init();
															
 
																-#endif
															
 
																+	starpu_cublas_init();
															
 
																 	devices = starpu_cpu_worker_get_count();
															
 
																 	if (devices)
															
@@ -246,11 +244,8 @@ int main(int argc, char **argv)
 
																 error:
															
 
																 	if (ret == -ENODEV) ret=STARPU_TEST_SKIPPED;
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-	if (cublas_version >= 7050)
															
 
																-		starpu_cublas_shutdown();
															
 
																-#endif
															
 
																+	starpu_cublas_shutdown();
															
 
																 	starpu_shutdown();
															
 
																 	STARPU_RETURN(ret);
															
 
																 }