Bläddra i källkod

Benefit from cublas v2

Samuel Thibault 8 år sedan
förälder
incheckning
441644b632

+ 1 - 0
ChangeLog

@@ -60,6 +60,7 @@ New features:
   * Add STARPU_PERF_MODEL_HOMOGENEOUS_CUDA/OPENCL/MIC/SCC to share performance
   * Add STARPU_PERF_MODEL_HOMOGENEOUS_CUDA/OPENCL/MIC/SCC to share performance
     models between devices, making calibration much faster.
     models between devices, making calibration much faster.
   * Add modular-heft-prio scheduler.
   * Add modular-heft-prio scheduler.
+  * Add starpu_cublas_get_local_handle helper.
 
 
 Changes:
 Changes:
   * Fix performance regression of lws for small tasks.
   * Fix performance regression of lws for small tasks.

+ 1 - 0
Makefile.am

@@ -98,6 +98,7 @@ versinclude_HEADERS = 				\
 	include/starpu_rand.h			\
 	include/starpu_rand.h			\
 	include/starpu_disk.h			\
 	include/starpu_disk.h			\
 	include/starpu_cublas.h			\
 	include/starpu_cublas.h			\
+	include/starpu_cublas_v2.h		\
 	include/starpu_driver.h			\
 	include/starpu_driver.h			\
 	include/starpu_stdlib.h			\
 	include/starpu_stdlib.h			\
 	include/starpu_thread.h			\
 	include/starpu_thread.h			\

+ 3 - 2
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -62,10 +62,11 @@ Unfortunately, some CUDA libraries do not have stream variants of
 kernels. That will lower the potential for overlapping.
 kernels. That will lower the potential for overlapping.
 
 
 Calling starpu_cublas_init() makes StarPU already do appropriate calls for the
 Calling starpu_cublas_init() makes StarPU already do appropriate calls for the
-CUBLAS library. Some libraries like Magma may however change the current stream,
+CUBLAS library. Some libraries like Magma may however change the current stream of CUBLAS v1,
 one then has to call <c>cublasSetKernelStream(starpu_cuda_get_local_stream())</c> at
 one then has to call <c>cublasSetKernelStream(starpu_cuda_get_local_stream())</c> at
 the beginning of the codelet to make sure that CUBLAS is really using the proper
 the beginning of the codelet to make sure that CUBLAS is really using the proper
-stream.
+stream. When using CUBLAS v2, starpu_cublas_local_handle() can be called to queue CUBLAS
+kernels with the proper configuration.
 
 
 If the kernel can be made to only use this local stream or other self-allocated
 If the kernel can be made to only use this local stream or other self-allocated
 streams, i.e. the whole kernel submission can be made asynchronous, then
 streams, i.e. the whole kernel submission can be made asynchronous, then

+ 9 - 3
doc/doxygen/chapters/api/cuda_extensions.doxy

@@ -69,10 +69,16 @@ initialized on every device.
 
 
 \fn void starpu_cublas_set_stream(void)
 \fn void starpu_cublas_set_stream(void)
 \ingroup API_CUDA_Extensions
 \ingroup API_CUDA_Extensions
-This function sets the proper cublas stream. This must be called from the CUDA
+This function sets the proper CUBLAS stream for CUBLAS v1. This must be called from the CUDA
-codelet before calling cublas kernels, so that they are queued on the proper
+codelet before calling CUBLAS v1 kernels, so that they are queued on the proper
 CUDA stream. When using one thread per CUDA worker, this function does not
 CUDA stream. When using one thread per CUDA worker, this function does not
-do anything since the cublas stream does not change, and is set once by
+do anything since the CUBLAS stream does not change, and is set once by
+starpu_cublas_init().
+
+\fn cublasHandle_t starpu_cublas_get_local_handle(void)
+\ingroup API_CUDA_Extensions
+This function returns the CUBLAS v2 handle to be used to queue CUBLAS v2
+kernels. It is properly initialized and configured for multistream by
 starpu_cublas_init().
 starpu_cublas_init().
 
 
 \fn void starpu_cublas_shutdown(void)
 \fn void starpu_cublas_shutdown(void)

+ 5 - 2
examples/audio/starpu_audio_processing.c

@@ -32,6 +32,7 @@
 #include <fftw3.h>
 #include <fftw3.h>
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 #include <cufft.h>
 #include <cufft.h>
+#include <starpu_cublas_v2.h>
 #endif
 #endif
 
 
 /* #define SAVE_RAW	1 */
 /* #define SAVE_RAW	1 */
@@ -198,7 +199,6 @@ static void band_filter_kernel_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *
 
 
 	localout = plans[workerid].localout;
 	localout = plans[workerid].localout;
 
 
-	starpu_cublas_set_stream();
 	/* FFT */
 	/* FFT */
 	cures = cufftExecR2C(plans[workerid].plan, localA, localout);
 	cures = cufftExecR2C(plans[workerid].plan, localA, localout);
 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
@@ -216,7 +216,10 @@ static void band_filter_kernel_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *
 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
 
 
 	/* FFTW does not normalize its output ! */
 	/* FFTW does not normalize its output ! */
-	cublasSscal (nsamples, 1.0f/nsamples, localA, 1);
+	float scal = 1.0f/nsamples;
+	cublasStatus_t status = cublasSscal (starpu_cublas_local_handle(), nsamples, &scal, localA, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 }
 #endif
 #endif
 
 

+ 4 - 3
examples/axpy/axpy.c

@@ -30,7 +30,7 @@
 #include <common/blas.h>
 #include <common/blas.h>
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-#include <cublas.h>
+#include <starpu_cublas_v2.h>
 #endif
 #endif
 
 
 #include "axpy.h"
 #include "axpy.h"
@@ -74,8 +74,9 @@ void axpy_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
 	TYPE *block_x = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *block_x = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *block_y = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 	TYPE *block_y = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 
 
-	starpu_cublas_set_stream();
+	cublasStatus_t status = CUBLASAXPY(starpu_cublas_get_local_handle(), (int)n, &alpha, block_x, 1, block_y, 1);
-	CUBLASAXPY((int)n, alpha, block_x, 1, block_y, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 }
 #endif
 #endif
 
 

+ 0 - 1
examples/cg/cg.c

@@ -21,7 +21,6 @@
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cuda.h>
-#include <cublas.h>
 #endif
 #endif
 
 
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)

+ 51 - 18
examples/cg/cg_kernels.c

@@ -22,6 +22,12 @@
 #include <math.h>
 #include <math.h>
 #include <limits.h>
 #include <limits.h>
 
 
+#ifdef STARPU_USE_CUDA
+#include <starpu_cublas_v2.h>
+static const TYPE p1 = 1.0;
+static const TYPE m1 = -1.0;
+#endif
+
 #if 0
 #if 0
 static void print_vector_from_descr(unsigned nx, TYPE *v)
 static void print_vector_from_descr(unsigned nx, TYPE *v)
 {
 {
@@ -81,8 +87,9 @@ static void accumulate_variable_cuda(void *descr[], void *cl_arg)
 	TYPE *v_dst = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	TYPE *v_dst = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	TYPE *v_src = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
 	TYPE *v_src = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
  
  
-	starpu_cublas_set_stream();
+	cublasStatus_t status = cublasaxpy(starpu_cublas_get_local_handle(), 1, &p1, v_src, 1, v_dst, 1);
-	cublasaxpy(1, (TYPE)1.0, v_src, 1, v_dst, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 }
 #endif
 #endif
 
 
@@ -120,9 +127,10 @@ static void accumulate_vector_cuda(void *descr[], void *cl_arg)
 	TYPE *v_dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v_dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v_src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 	TYPE *v_src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
- 
+
-	starpu_cublas_set_stream();
+	cublasStatus_t status = cublasaxpy(starpu_cublas_get_local_handle(), n, &p1, v_src, 1, v_dst, 1);
-	cublasaxpy(n, (TYPE)1.0, v_src, 1, v_dst, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 }
 #endif
 #endif
 
 
@@ -249,10 +257,26 @@ static void dot_kernel_cuda(void *descr[], void *cl_arg)
 
 
 	unsigned n = STARPU_VECTOR_GET_NX(descr[1]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[1]);
 
 
-	/* Contrary to cublasSdot, this function puts its result directly in
+	int version;
-	 * device memory, so that we don't have to transfer that value back and
+	cublasGetVersion(starpu_cublas_get_local_handle(), &version);
-	 * forth. */
+
-	dot_host(v1, v2, n, dot);
+	/* FIXME: check in Nvidia bug #1882017 when this gets fixed */
+	if (version < 99999)
+	{
+		/* This function puts its result directly in device memory, so
+		 * that we don't have to transfer that value back and forth. */
+		dot_host(v1, v2, n, dot);
+	}
+	else
+	{
+		/* Should be able to put result in GPU, but does not yet, see
+		 * Nvidia bug #1882017 */
+		cublasStatus_t status = cublasdot(starpu_cublas_get_local_handle(),
+			n, v1, 1, v2, 1, dot);
+		if (status != CUBLAS_STATUS_SUCCESS)
+			STARPU_CUBLAS_REPORT_ERROR(status);
+		cudaStreamSynchronize(starpu_cuda_get_local_stream());
+	}
 }
 }
 #endif
 #endif
 
 
@@ -337,8 +361,9 @@ static void scal_kernel_cuda(void *descr[], void *cl_arg)
  
  
 	/* v1 = p1 v1 */
 	/* v1 = p1 v1 */
 	TYPE alpha = p1;
 	TYPE alpha = p1;
-	starpu_cublas_set_stream();
+	cublasStatus_t status = cublasscal(starpu_cublas_get_local_handle(), n, &alpha, v1, 1);
-	cublasscal(n, alpha, v1, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 }
 #endif
 #endif
 
 
@@ -392,8 +417,10 @@ static void gemv_kernel_cuda(void *descr[], void *cl_arg)
 	starpu_codelet_unpack_args(cl_arg, &beta, &alpha);
 	starpu_codelet_unpack_args(cl_arg, &beta, &alpha);
 
 
 	/* Compute v1 = alpha M v2 + beta v1 */
 	/* Compute v1 = alpha M v2 + beta v1 */
-	starpu_cublas_set_stream();
+	cublasStatus_t status = cublasgemv(starpu_cublas_get_local_handle(),
-	cublasgemv('N', nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
+			CUBLAS_OP_N, nx, ny, &alpha, M, ld, v2, 1, &beta, v1, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 }
 #endif
 #endif
 
 
@@ -508,9 +535,13 @@ static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
 	 *	v1 = p1 v1
 	 *	v1 = p1 v1
 	 *	v1 = v1 + p2 v2
 	 *	v1 = v1 + p2 v2
 	 */
 	 */
-	starpu_cublas_set_stream();
+	cublasStatus_t status;
-	cublasscal(n, p1, v1, 1);
+	status = cublasscal(starpu_cublas_get_local_handle(), n, &p1, v1, 1);
-	cublasaxpy(n, p2, v2, 1, v1, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
+	status = cublasaxpy(starpu_cublas_get_local_handle(), n, &p2, v2, 1, v1, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 }
 #endif
 #endif
 
 
@@ -589,8 +620,10 @@ static void axpy_kernel_cuda(void *descr[], void *cl_arg)
  
  
 	/* Compute v1 = v1 + p1 * v2.
 	/* Compute v1 = v1 + p1 * v2.
 	 */
 	 */
-	starpu_cublas_set_stream();
+	cublasStatus_t status = cublasaxpy(starpu_cublas_get_local_handle(),
-	cublasaxpy(n, p1, v2, 1, v1, 1);
+			n, &p1, v2, 1, v1, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 }
 #endif
 #endif
 
 

+ 0 - 1
examples/cholesky/cholesky.h

@@ -24,7 +24,6 @@
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuda_runtime.h>
-#include <cublas.h>
 #endif
 #endif
 
 
 #include <common/blas.h>
 #include <common/blas.h>

+ 42 - 24
examples/cholesky/cholesky_kernels.c

@@ -25,15 +25,24 @@
 #include <starpu.h>
 #include <starpu.h>
 #include "cholesky.h"
 #include "cholesky.h"
 #include "../common/blas.h"
 #include "../common/blas.h"
-#if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_MAGMA)
+#if defined(STARPU_USE_CUDA)
+#include <cublas.h>
+#include <starpu_cublas_v2.h>
+#if defined(STARPU_HAVE_MAGMA)
 #include "magma.h"
 #include "magma.h"
 #include "magma_lapack.h"
 #include "magma_lapack.h"
 #endif
 #endif
+#endif
 
 
 /*
 /*
  *   U22 
  *   U22 
  */
  */
 
 
+#if defined(STARPU_USE_CUDA)
+static const float p1 =  1.0;
+static const float m1 = -1.0;
+#endif
+
 static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
 static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
 {
 {
 	/* printf("22\n"); */
 	/* printf("22\n"); */
@@ -78,14 +87,12 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STAR
 	{
 	{
 		/* CUDA kernel */
 		/* CUDA kernel */
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-#ifdef STARPU_HAVE_MAGMA
+		cublasStatus_t status = cublasSgemm(starpu_cublas_get_local_handle(),
-		cublasSetKernelStream(starpu_cuda_get_local_stream());
+				CUBLAS_OP_N, CUBLAS_OP_T, dy, dx, dz, 
-#else
+				&m1, left, ld21, right, ld12, 
-		starpu_cublas_set_stream();
+				&p1, center, ld22);
-#endif
+		if (status != CUBLAS_STATUS_SUCCESS)
-		cublasSgemm('n', 't', dy, dx, dz, 
+			STARPU_CUBLAS_REPORT_ERROR(status);
-				-1.0f, left, ld21, right, ld12, 
-				 1.0f, center, ld22);
 #endif
 #endif
 
 
 	}
 	}
@@ -122,6 +129,10 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, STARPU_A
 	unsigned nx21 = STARPU_MATRIX_GET_NY(descr[1]);
 	unsigned nx21 = STARPU_MATRIX_GET_NY(descr[1]);
 	unsigned ny21 = STARPU_MATRIX_GET_NX(descr[1]);
 	unsigned ny21 = STARPU_MATRIX_GET_NX(descr[1]);
 
 
+#ifdef STARPU_USE_CUDA
+	cublasStatus status;
+#endif
+
 	switch (s)
 	switch (s)
 	{
 	{
 		case 0:
 		case 0:
@@ -129,12 +140,11 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, STARPU_A
 			break;
 			break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
-#ifdef STARPU_HAVE_MAGMA
+			status = cublasStrsm(starpu_cublas_get_local_handle(),
-			cublasSetKernelStream(starpu_cuda_get_local_stream());
+					CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT,
-#else
+					nx21, ny21, &p1, sub11, ld11, sub21, ld21);
-			starpu_cublas_set_stream();
+			if (status != CUBLAS_STATUS_SUCCESS)
-#endif
+				STARPU_CUBLAS_REPORT_ERROR(status);
-			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
 			break;
 			break;
 #endif
 #endif
 		default:
 		default:
@@ -206,9 +216,10 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 			{
 			{
 			int ret;
 			int ret;
 			int info;
 			int info;
+			cudaStream_t stream = starpu_cuda_get_local_stream();
 #if (MAGMA_VERSION_MAJOR > 1) || (MAGMA_VERSION_MAJOR == 1 && MAGMA_VERSION_MINOR >= 4)
 #if (MAGMA_VERSION_MAJOR > 1) || (MAGMA_VERSION_MAJOR == 1 && MAGMA_VERSION_MINOR >= 4)
-			cublasSetKernelStream(starpu_cuda_get_local_stream());
+			cublasSetKernelStream(stream);
-			magmablasSetKernelStream(starpu_cuda_get_local_stream());
+			magmablasSetKernelStream(stream);
 #else
 #else
 			starpu_cublas_set_stream();
 			starpu_cublas_set_stream();
 #endif
 #endif
@@ -219,7 +230,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 				STARPU_ABORT();
 				STARPU_ABORT();
 			}
 			}
 #if (MAGMA_VERSION_MAJOR > 1) || (MAGMA_VERSION_MAJOR == 1 && MAGMA_VERSION_MINOR >= 4)
 #if (MAGMA_VERSION_MAJOR > 1) || (MAGMA_VERSION_MAJOR == 1 && MAGMA_VERSION_MINOR >= 4)
-			cudaError_t cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaError_t cures = cudaStreamSynchronize(stream);
 #else
 #else
 			cudaError_t cures = cudaThreadSynchronize();
 			cudaError_t cures = cudaThreadSynchronize();
 #endif
 #endif
@@ -229,29 +240,36 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 			{
 			{
 
 
 			float *lambda11;
 			float *lambda11;
+			cublasStatus_t status;
+			cudaStream_t stream = starpu_cuda_get_local_stream();
+			cublasHandle_t handle = starpu_cublas_get_local_handle();
 			cudaHostAlloc((void **)&lambda11, sizeof(float), 0);
 			cudaHostAlloc((void **)&lambda11, sizeof(float), 0);
 
 
 			for (z = 0; z < nx; z++)
 			for (z = 0; z < nx; z++)
 			{
 			{
 				
 				
-				cudaMemcpyAsync(lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+				cudaMemcpyAsync(lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, stream);
-				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+				cudaStreamSynchronize(stream);
 
 
 				STARPU_ASSERT(*lambda11 != 0.0f);
 				STARPU_ASSERT(*lambda11 != 0.0f);
 				
 				
 				*lambda11 = sqrt(*lambda11);
 				*lambda11 = sqrt(*lambda11);
 
 
 /*				cublasSetVector(1, sizeof(float), lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float)); */
 /*				cublasSetVector(1, sizeof(float), lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float)); */
-				cudaMemcpyAsync(&sub11[z+z*ld], lambda11, sizeof(float), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+				cudaMemcpyAsync(&sub11[z+z*ld], lambda11, sizeof(float), cudaMemcpyHostToDevice, stream);
+				float scal = 1.0f/(*lambda11);
 
 
-				cublasSscal(nx - z - 1, 1.0f/(*lambda11), &sub11[(z+1)+z*ld], 1);
+				status = cublasSscal(handle,
+						nx - z - 1, &scal, &sub11[(z+1)+z*ld], 1);
 
 
-				cublasSsyr('U', nx - z - 1, -1.0f,
+				status = cublasSsyr(handle,
+							CUBLAS_FILL_MODE_UPPER,
+							nx - z - 1, &m1,
 							&sub11[(z+1)+z*ld], 1,
 							&sub11[(z+1)+z*ld], 1,
 							&sub11[(z+1)+(z+1)*ld], ld);
 							&sub11[(z+1)+(z+1)*ld], ld);
 			}
 			}
 
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaStreamSynchronize(stream);
 			cudaFreeHost(lambda11);
 			cudaFreeHost(lambda11);
 			}
 			}
 #endif
 #endif

+ 3 - 3
examples/heat/dw_factolu.c

@@ -72,8 +72,8 @@ static struct starpu_codelet cl12 =
 	.cpu_funcs_name = {"dw_cpu_codelet_update_u12"},
 	.cpu_funcs_name = {"dw_cpu_codelet_update_u12"},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u12},
 	.cuda_funcs = {dw_cublas_codelet_update_u12},
-	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 2,
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_RW},
 	.modes = {STARPU_R, STARPU_RW},
 	.model = &model_12
 	.model = &model_12
@@ -85,8 +85,8 @@ static struct starpu_codelet cl21 =
 	.cpu_funcs_name = {"dw_cpu_codelet_update_u21"},
 	.cpu_funcs_name = {"dw_cpu_codelet_update_u21"},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u21},
 	.cuda_funcs = {dw_cublas_codelet_update_u21},
-	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 2,
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_RW},
 	.modes = {STARPU_R, STARPU_RW},
 	.model = &model_21
 	.model = &model_21
@@ -98,8 +98,8 @@ static struct starpu_codelet cl22 =
 	.cpu_funcs_name = {"dw_cpu_codelet_update_u22"},
 	.cpu_funcs_name = {"dw_cpu_codelet_update_u22"},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u22},
 	.cuda_funcs = {dw_cublas_codelet_update_u22},
-	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 3,
 	.nbuffers = 3,
 	.modes = {STARPU_R, STARPU_R, STARPU_RW},
 	.modes = {STARPU_R, STARPU_R, STARPU_RW},
 	.model = &model_22
 	.model = &model_22

+ 0 - 1
examples/heat/dw_factolu.h

@@ -25,7 +25,6 @@
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuda_runtime.h>
-#include <cublas.h>
 #endif
 #endif
 
 
 #include "../common/blas.h"
 #include "../common/blas.h"

+ 3 - 0
examples/heat/dw_factolu_grain.c

@@ -99,6 +99,7 @@ static struct starpu_codelet cl12 =
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u12},
 	.cuda_funcs = {dw_cublas_codelet_update_u12},
 #endif
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 2,
 	.nbuffers = 2,
 	.model = &model_12
 	.model = &model_12
 };
 };
@@ -144,6 +145,7 @@ static struct starpu_codelet cl21 =
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u21},
 	.cuda_funcs = {dw_cublas_codelet_update_u21},
 #endif
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 2,
 	.nbuffers = 2,
 	.model = &model_21
 	.model = &model_21
 };
 };
@@ -186,6 +188,7 @@ static struct starpu_codelet cl22 =
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u22},
 	.cuda_funcs = {dw_cublas_codelet_update_u22},
 #endif
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 3,
 	.nbuffers = 3,
 	.model = &model_22
 	.model = &model_22
 };
 };

+ 34 - 17
examples/heat/dw_factolu_kernels.c

@@ -20,6 +20,13 @@
  */
  */
 #include "dw_factolu.h"
 #include "dw_factolu.h"
 
 
+#ifdef STARPU_USE_CUDA
+#include <cublas.h>
+#include <starpu_cublas_v2.h>
+static const float p1 =  1.0;
+static const float m1 = -1.0;
+#endif
+
 unsigned count_11_per_worker[STARPU_NMAXWORKERS] = {0};
 unsigned count_11_per_worker[STARPU_NMAXWORKERS] = {0};
 unsigned count_12_per_worker[STARPU_NMAXWORKERS] = {0};
 unsigned count_12_per_worker[STARPU_NMAXWORKERS] = {0};
 unsigned count_21_per_worker[STARPU_NMAXWORKERS] = {0};
 unsigned count_21_per_worker[STARPU_NMAXWORKERS] = {0};
@@ -134,10 +141,10 @@ static inline void dw_common_cpu_codelet_update_u22(void *descr[], int s, STARPU
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
-			starpu_cublas_set_stream();
+			status = cublasSgemm(starpu_cublas_get_local_handle(),
-			cublasSgemm('n', 'n', dx, dy, dz, -1.0f, left, ld21,
+					CUBLAS_OP_N, CUBLAS_OP_N,
-					right, ld12, 1.0f, center, ld22);
+					dx, dy, dz, &m1, left, ld21,
-			status = cublasGetError();
+					right, ld12, &p1, center, ld22);
 			if (status != CUBLAS_STATUS_SUCCESS)
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
 
@@ -198,10 +205,10 @@ static inline void dw_common_codelet_update_u12(void *descr[], int s, STARPU_ATT
 			break;
 			break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
-			starpu_cublas_set_stream();
+			status = cublasStrsm(starpu_cublas_get_local_handle(),
-			cublasStrsm('L', 'L', 'N', 'N', ny12, nx12,
+					CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
-					1.0f, sub11, ld11, sub12, ld12);
+					ny12, nx12,
-			status = cublasGetError();
+					&p1, sub11, ld11, sub12, ld12);
 			if (status != CUBLAS_STATUS_SUCCESS)
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
 
@@ -260,9 +267,9 @@ static inline void dw_common_codelet_update_u21(void *descr[], int s, STARPU_ATT
 			break;
 			break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
-			starpu_cublas_set_stream();
+			status = cublasStrsm(starpu_cublas_get_local_handle(),
-			cublasStrsm('R', 'U', 'N', 'U', ny21, nx21, 1.0f, sub11, ld11, sub21, ld21);
+					CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT,
-			status = cublasGetError();
+					ny21, nx21, &p1, sub11, ld11, sub21, ld21);
 			if (status != CUBLAS_STATUS_SUCCESS)
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
 
@@ -322,6 +329,12 @@ static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATT
 
 
 	unsigned long z;
 	unsigned long z;
 
 
+#ifdef STARPU_USE_CUDA
+	cudaStream_t stream;
+	cublasHandle_t handle;
+	cublasStatus_t status;
+#endif
+
 	switch (s)
 	switch (s)
 	{
 	{
 		case 0:
 		case 0:
@@ -341,24 +354,28 @@ static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATT
 			break;
 			break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
-			starpu_cublas_set_stream();
+			stream = starpu_cuda_get_local_stream();
+			handle = starpu_cublas_get_local_handle();
 			for (z = 0; z < nx; z++)
 			for (z = 0; z < nx; z++)
 			{
 			{
 				float pivot;
 				float pivot;
-				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, stream);
-				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+				cudaStreamSynchronize(stream);
 
 
 				STARPU_ASSERT(pivot != 0.0f);
 				STARPU_ASSERT(pivot != 0.0f);
+				float scal = 1.0f/pivot;
 
 
-				cublasSscal(nx - z - 1, 1.0f/pivot, &sub11[z+(z+1)*ld], ld);
+				status = cublasSscal(starpu_cublas_get_local_handle(),
+						nx - z - 1, &scal, &sub11[z+(z+1)*ld], ld);
 
 
-				cublasSger(nx - z - 1, nx - z - 1, -1.0f,
+				status = cublasSger(starpu_cublas_get_local_handle(),
+						nx - z - 1, nx - z - 1, &m1,
 								&sub11[z+(z+1)*ld], ld,
 								&sub11[z+(z+1)*ld], ld,
 								&sub11[(z+1)+z*ld], 1,
 								&sub11[(z+1)+z*ld], 1,
 								&sub11[(z+1) + (z+1)*ld],ld);
 								&sub11[(z+1) + (z+1)*ld],ld);
 			}
 			}
 
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaStreamSynchronize(stream);
 
 
 			break;
 			break;
 #endif
 #endif

+ 3 - 0
examples/heat/dw_sparse_cg.c

@@ -241,6 +241,7 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 	struct starpu_task *task6 = create_task(maskiter | 6UL);
 	struct starpu_task *task6 = create_task(maskiter | 6UL);
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	task6->cl->cuda_funcs[0] = cublas_codelet_func_6;
 	task6->cl->cuda_funcs[0] = cublas_codelet_func_6;
+	task6->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
 #endif
 #endif
 	task6->cl->cpu_funcs[0] = cpu_codelet_func_6;
 	task6->cl->cpu_funcs[0] = cpu_codelet_func_6;
 	task6->cl->cpu_funcs_name[0] = "cpu_codelet_func_6";
 	task6->cl->cpu_funcs_name[0] = "cpu_codelet_func_6";
@@ -259,6 +260,7 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 	struct starpu_task *task7 = create_task(maskiter | 7UL);
 	struct starpu_task *task7 = create_task(maskiter | 7UL);
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	task7->cl->cuda_funcs[0] = cublas_codelet_func_7;
 	task7->cl->cuda_funcs[0] = cublas_codelet_func_7;
+	task7->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
 #endif
 #endif
 	task7->cl->cpu_funcs[0] = cpu_codelet_func_7;
 	task7->cl->cpu_funcs[0] = cpu_codelet_func_7;
 	task7->cl->cpu_funcs_name[0] = "cpu_codelet_func_7";
 	task7->cl->cpu_funcs_name[0] = "cpu_codelet_func_7";
@@ -292,6 +294,7 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 	struct starpu_task *task9 = create_task(maskiter | 9UL);
 	struct starpu_task *task9 = create_task(maskiter | 9UL);
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	task9->cl->cuda_funcs[0] = cublas_codelet_func_9;
 	task9->cl->cuda_funcs[0] = cublas_codelet_func_9;
+	task9->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
 #endif
 #endif
 	task9->cl->cpu_funcs[0] = cpu_codelet_func_9;
 	task9->cl->cpu_funcs[0] = cpu_codelet_func_9;
 	task9->cl->cpu_funcs_name[0] = "cpu_codelet_func_9";
 	task9->cl->cpu_funcs_name[0] = "cpu_codelet_func_9";

+ 0 - 4
examples/heat/dw_sparse_cg.h

@@ -29,10 +29,6 @@
 
 
 #include <starpu.h>
 #include <starpu.h>
 
 
-#ifdef STARPU_USE_CUDA
-#include <cublas.h>
-#endif
-
 #include "../common/blas.h"
 #include "../common/blas.h"
 
 
 #define MAXITER	100000
 #define MAXITER	100000

+ 30 - 13
examples/heat/dw_sparse_cg_kernels.c

@@ -17,6 +17,10 @@
 
 
 #include "dw_sparse_cg.h"
 #include "dw_sparse_cg.h"
 
 
+#ifdef STARPU_USE_CUDA
+#include <starpu_cublas_v2.h>
+#endif
+
 /*
 /*
  *	Algorithm :
  *	Algorithm :
  *		
  *		
@@ -146,8 +150,10 @@ void cublas_codelet_func_3(void *descr[], void *arg)
 	vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
 
-	starpu_cublas_set_stream();
+	cublasStatus_t status = cublasSdot (starpu_cublas_get_local_handle(), size, vec, 1, vec, 1, &dot);
-	dot = cublasSdot (size, vec, 1, vec, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 
 	pb->delta_new = dot;
 	pb->delta_new = dot;
 	pb->delta_0 = dot;
 	pb->delta_0 = dot;
@@ -239,8 +245,10 @@ void cublas_codelet_func_5(void *descr[], void *arg)
 	STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
 	STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
 
-	starpu_cublas_set_stream();
+	cublasStatus_t status = cublasSdot (starpu_cublas_get_local_handle(), size, vecd, 1, vecq, 1, &dot);
-	dot = cublasSdot (size, vecd, 1, vecq, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 
 	pb->alpha = pb->delta_new / dot;
 	pb->alpha = pb->delta_new / dot;
 }
 }
@@ -283,8 +291,9 @@ void cublas_codelet_func_6(void *descr[], void *arg)
 
 
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
 
-	starpu_cublas_set_stream();
+	cublasStatus_t status = cublasSaxpy (starpu_cublas_get_local_handle(), size, &pb->alpha, vecd, 1, vecx, 1);
-	cublasSaxpy (size, pb->alpha, vecd, 1, vecx, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 }
 #endif
 #endif
 
 
@@ -323,8 +332,11 @@ void cublas_codelet_func_7(void *descr[], void *arg)
 
 
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
 
-	starpu_cublas_set_stream();
+	float scal = -pb->alpha;
-	cublasSaxpy (size, -pb->alpha, vecq, 1, vecr, 1);
+
+	cublasStatus_t status = cublasSaxpy (starpu_cublas_get_local_handle(), size, &scal, vecq, 1, vecr, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 }
 #endif
 #endif
 
 
@@ -367,8 +379,8 @@ void cublas_codelet_func_8(void *descr[], void *arg)
 	vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
 
-	starpu_cublas_set_stream();
+	cublasStatus_t status = cublasSdot (starpu_cublas_get_local_handle(), size, vecr, 1, vecr, 1, &dot);
-	dot = cublasSdot (size, vecr, 1, vecr, 1);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 
 	pb->delta_old = pb->delta_new;
 	pb->delta_old = pb->delta_new;
 	pb->delta_new = dot;
 	pb->delta_new = dot;
@@ -416,11 +428,16 @@ void cublas_codelet_func_9(void *descr[], void *arg)
 
 
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
 
-	starpu_cublas_set_stream();
 	/* d = beta d */
 	/* d = beta d */
-	cublasSscal(size, pb->beta, vecd, 1);
+	cublasStatus_t status;
+	status = cublasSscal(starpu_cublas_get_local_handle(), size, &pb->beta, vecd, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 
 
 	/* d = r + d */
 	/* d = r + d */
-	cublasSaxpy (size, 1.0f, vecr, 1, vecd, 1);
+	float scal = 1.0f;
+	status = cublasSaxpy (starpu_cublas_get_local_handle(), size, &scal, vecr, 1, vecd, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 }
 #endif
 #endif

+ 0 - 3
examples/lu/xlu.h

@@ -20,9 +20,6 @@
 
 
 #include <starpu.h>
 #include <starpu.h>
 #include <common/blas.h>
 #include <common/blas.h>
-#ifdef STARPU_USE_CUDA
-#include <cublas.h>
-#endif
 
 
 #define TAG11(k)	((starpu_tag_t)( (1ULL<<60) | (unsigned long long)(k)))
 #define TAG11(k)	((starpu_tag_t)( (1ULL<<60) | (unsigned long long)(k)))
 #define TAG12(k,i)	((starpu_tag_t)(((2ULL<<60) | (((unsigned long long)(k))<<32)	\
 #define TAG12(k,i)	((starpu_tag_t)(((2ULL<<60) | (((unsigned long long)(k))<<32)	\

+ 86 - 32
examples/lu/xlu_kernels.c

@@ -21,6 +21,11 @@
 #include <math.h>
 #include <math.h>
 #include <complex.h>
 #include <complex.h>
 
 
+#ifdef STARPU_USE_CUDA
+#include <cublas.h>
+#include <starpu_cublas_v2.h>
+#endif
+
 #define str(s) #s
 #define str(s) #s
 #define xstr(s)        str(s)
 #define xstr(s)        str(s)
 #define STARPU_LU_STR(name)  xstr(STARPU_LU(name))
 #define STARPU_LU_STR(name)  xstr(STARPU_LU(name))
@@ -65,12 +70,11 @@ static inline void STARPU_LU(common_u22)(void *descr[],
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
 		{
 		{
-			starpu_cublas_set_stream();
+			status = CUBLAS_GEMM(starpu_cublas_get_local_handle(),
-			CUBLAS_GEMM('n', 'n', dx, dy, dz,
+				CUBLAS_OP_N, CUBLAS_OP_N, dx, dy, dz,
-				*(CUBLAS_TYPE*)&m1, (CUBLAS_TYPE *)right, ld21, (CUBLAS_TYPE *)left, ld12,
+				(CUBLAS_TYPE *)&m1, (CUBLAS_TYPE *)right, ld21, (CUBLAS_TYPE *)left, ld12,
-				*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE *)center, ld22);
+				(CUBLAS_TYPE *)&p1, (CUBLAS_TYPE *)center, ld22);
 
 
-			status = cublasGetError();
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
 				STARPU_CUBLAS_REPORT_ERROR(status);
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
 
@@ -186,11 +190,11 @@ static inline void STARPU_LU(common_u12)(void *descr[],
 			break;
 			break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
-			starpu_cublas_set_stream();
+			status = CUBLAS_TRSM(starpu_cublas_get_local_handle(),
-			CUBLAS_TRSM('L', 'L', 'N', 'N', ny12, nx12,
+					CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
-					*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub12, ld12);
+					ny12, nx12,
+					(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub12, ld12);
 
 
-			status = cublasGetError();
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
 				STARPU_CUBLAS_REPORT_ERROR(status);
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
 
@@ -273,11 +277,11 @@ static inline void STARPU_LU(common_u21)(void *descr[],
 			break;
 			break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
-			starpu_cublas_set_stream();
+			status = CUBLAS_TRSM(starpu_cublas_get_local_handle(),
-			CUBLAS_TRSM('R', 'U', 'N', 'U', ny21, nx21,
+					CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT,
-					*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub21, ld21);
+					ny21, nx21,
+					(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub21, ld21);
 
 
-			status = cublasGetError();
 			if (status != CUBLAS_STATUS_SUCCESS)
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
 
@@ -345,6 +349,12 @@ static inline void STARPU_LU(common_u11)(void *descr[],
 
 
 	unsigned long z;
 	unsigned long z;
 
 
+#ifdef STARPU_USE_CUDA
+	cublasStatus status;
+	cublasHandle_t handle;
+	cudaStream_t stream;
+#endif
+
 	switch (s)
 	switch (s)
 	{
 	{
 		case 0:
 		case 0:
@@ -369,13 +379,14 @@ static inline void STARPU_LU(common_u11)(void *descr[],
 			break;
 			break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
-			starpu_cublas_set_stream();
+			handle = starpu_cublas_get_local_handle();
+			stream = starpu_cuda_get_local_stream();
 			for (z = 0; z < nx; z++)
 			for (z = 0; z < nx; z++)
 			{
 			{
 				TYPE pivot;
 				TYPE pivot;
 				TYPE inv_pivot;
 				TYPE inv_pivot;
-				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, stream);
-				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+				cudaStreamSynchronize(stream);
 
 
 #ifdef COMPLEX_LU
 #ifdef COMPLEX_LU
 				STARPU_ASSERT(fpclassify(creal(pivot)) != FP_ZERO);
 				STARPU_ASSERT(fpclassify(creal(pivot)) != FP_ZERO);
@@ -385,15 +396,23 @@ static inline void STARPU_LU(common_u11)(void *descr[],
 #endif
 #endif
 				
 				
 				inv_pivot = 1.0/pivot;
 				inv_pivot = 1.0/pivot;
-				CUBLAS_SCAL(nx - z - 1, *(CUBLAS_TYPE*)&inv_pivot, (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
+				status = CUBLAS_SCAL(handle,
+						nx - z - 1,
+						(CUBLAS_TYPE*)&inv_pivot, (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
+				if (status != CUBLAS_STATUS_SUCCESS)
+					STARPU_CUBLAS_REPORT_ERROR(status);
 				
 				
-				CUBLAS_GER(nx - z - 1, nx - z - 1, *(CUBLAS_TYPE*)&m1,
+				status = CUBLAS_GER(handle,
+						nx - z - 1, nx - z - 1,
+						(CUBLAS_TYPE*)&m1,
 						(CUBLAS_TYPE*)&sub11[(z+1)+z*ld], 1,
 						(CUBLAS_TYPE*)&sub11[(z+1)+z*ld], 1,
 						(CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld,
 						(CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld,
 						(CUBLAS_TYPE*)&sub11[(z+1) + (z+1)*ld],ld);
 						(CUBLAS_TYPE*)&sub11[(z+1) + (z+1)*ld],ld);
+				if (status != CUBLAS_STATUS_SUCCESS)
+					STARPU_CUBLAS_REPORT_ERROR(status);
 			}
 			}
 			
 			
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaStreamSynchronize(stream);
 
 
 			break;
 			break;
 #endif
 #endif
@@ -462,6 +481,12 @@ static inline void STARPU_LU(common_u11_pivot)(void *descr[],
 	unsigned *ipiv = piv->piv;
 	unsigned *ipiv = piv->piv;
 	unsigned first = piv->first;
 	unsigned first = piv->first;
 
 
+#ifdef STARPU_USE_CUDA
+	cublasStatus status;
+	cublasHandle_t handle;
+	cudaStream_t stream;
+#endif
+
 	switch (s)
 	switch (s)
 	{
 	{
 		case 0:
 		case 0:
@@ -500,44 +525,63 @@ static inline void STARPU_LU(common_u11_pivot)(void *descr[],
 			break;
 			break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
-			starpu_cublas_set_stream();
+			handle = starpu_cublas_get_local_handle();
+			stream = starpu_cuda_get_local_stream();
 			for (z = 0; z < nx; z++)
 			for (z = 0; z < nx; z++)
 			{
 			{
 				TYPE pivot;
 				TYPE pivot;
 				TYPE inv_pivot;
 				TYPE inv_pivot;
-				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, stream);
-				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+				cudaStreamSynchronize(stream);
 
 
 				if (fabs((double)(pivot)) < PIVOT_THRESHHOLD)
 				if (fabs((double)(pivot)) < PIVOT_THRESHHOLD)
 				{
 				{
 					/* find the pivot */
 					/* find the pivot */
-					int piv_ind = CUBLAS_IAMAX(nx - z, (CUBLAS_TYPE*)&sub11[z*(ld+1)], ld) - 1;
+					int piv_ind;
+					status = CUBLAS_IAMAX(handle,
+						nx - z, (CUBLAS_TYPE*)&sub11[z*(ld+1)], ld, &piv_ind);
+					piv_ind -= 1;
+					if (status != CUBLAS_STATUS_SUCCESS)
+						STARPU_CUBLAS_REPORT_ERROR(status);
 	
 	
 					ipiv[z + first] = piv_ind + z + first;
 					ipiv[z + first] = piv_ind + z + first;
 
 
 					/* swap if needed */
 					/* swap if needed */
 					if (piv_ind != 0)
 					if (piv_ind != 0)
 					{
 					{
-						CUBLAS_SWAP(nx, (CUBLAS_TYPE*)&sub11[z*ld], 1, (CUBLAS_TYPE*)&sub11[(z+piv_ind)*ld], 1);
+						status = CUBLAS_SWAP(handle,
+							nx,
+							(CUBLAS_TYPE*)&sub11[z*ld], 1,
+							(CUBLAS_TYPE*)&sub11[(z+piv_ind)*ld], 1);
+						if (status != CUBLAS_STATUS_SUCCESS)
+							STARPU_CUBLAS_REPORT_ERROR(status);
 					}
 					}
 
 
-					cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+					cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, stream);
-					cudaStreamSynchronize(starpu_cuda_get_local_stream());
+					cudaStreamSynchronize(stream);
 				}
 				}
 
 
 				STARPU_ASSERT(pivot != 0.0);
 				STARPU_ASSERT(pivot != 0.0);
 				
 				
 				inv_pivot = 1.0/pivot;
 				inv_pivot = 1.0/pivot;
-				CUBLAS_SCAL(nx - z - 1, *(CUBLAS_TYPE*)&inv_pivot, (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
+				status = CUBLAS_SCAL(handle,
+						nx - z - 1,
+						(CUBLAS_TYPE*)&inv_pivot,
+						(CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
+				if (status != CUBLAS_STATUS_SUCCESS)
+					STARPU_CUBLAS_REPORT_ERROR(status);
 				
 				
-				CUBLAS_GER(nx - z - 1, nx - z - 1, *(CUBLAS_TYPE*)&m1,
+				status = CUBLAS_GER(handle,
+						nx - z - 1, nx - z - 1,
+						(CUBLAS_TYPE*)&m1,
 						(CUBLAS_TYPE*)&sub11[(z+1)+z*ld], 1,
 						(CUBLAS_TYPE*)&sub11[(z+1)+z*ld], 1,
 						(CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld,
 						(CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld,
 						(CUBLAS_TYPE*)&sub11[(z+1) + (z+1)*ld],ld);
 						(CUBLAS_TYPE*)&sub11[(z+1) + (z+1)*ld],ld);
-				
+				if (status != CUBLAS_STATUS_SUCCESS)
+						STARPU_CUBLAS_REPORT_ERROR(status);
 			}
 			}
 
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaStreamSynchronize(stream);
 
 
 			break;
 			break;
 #endif
 #endif
@@ -605,6 +649,11 @@ static inline void STARPU_LU(common_pivot)(void *descr[],
 	unsigned *ipiv = piv->piv;
 	unsigned *ipiv = piv->piv;
 	unsigned first = piv->first;
 	unsigned first = piv->first;
 
 
+#ifdef STARPU_USE_CUDA
+	cublasStatus status;
+	cublasHandle_t handle;
+#endif
+
 	switch (s)
 	switch (s)
 	{
 	{
 		case 0:
 		case 0:
@@ -619,13 +668,18 @@ static inline void STARPU_LU(common_pivot)(void *descr[],
 			break;
 			break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
-			starpu_cublas_set_stream();
+			handle = starpu_cublas_get_local_handle();
 			for (row = 0; row < nx; row++)
 			for (row = 0; row < nx; row++)
 			{
 			{
 				unsigned rowpiv = ipiv[row+first] - first;
 				unsigned rowpiv = ipiv[row+first] - first;
 				if (rowpiv != row)
 				if (rowpiv != row)
 				{
 				{
-					CUBLAS_SWAP(nx, (CUBLAS_TYPE*)&matrix[row*ld], 1, (CUBLAS_TYPE*)&matrix[rowpiv*ld], 1);
+					status = CUBLAS_SWAP(handle,
+							nx,
+							(CUBLAS_TYPE*)&matrix[row*ld], 1,
+							(CUBLAS_TYPE*)&matrix[rowpiv*ld], 1);
+					if (status != CUBLAS_STATUS_SUCCESS)
+						STARPU_CUBLAS_REPORT_ERROR(status);
 				}
 				}
 			}
 			}
 
 

+ 11 - 4
examples/mult/xgemm.c

@@ -36,7 +36,10 @@
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cuda.h>
-#include <cublas.h>
+#include <starpu_cublas_v2.h>
+static const TYPE p1 = 1.0;
+static const TYPE m1 = -1.0;
+static const TYPE v0 = 0.0;
 #endif
 #endif
 
 
 static unsigned niter = 10;
 static unsigned niter = 10;
@@ -161,9 +164,13 @@ static void cublas_mult(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
 	unsigned ldB = STARPU_MATRIX_GET_LD(descr[1]);
 	unsigned ldB = STARPU_MATRIX_GET_LD(descr[1]);
 	unsigned ldC = STARPU_MATRIX_GET_LD(descr[2]);
 	unsigned ldC = STARPU_MATRIX_GET_LD(descr[2]);
 
 
-	starpu_cublas_set_stream();
+	cublasStatus_t status = CUBLAS_GEMM(starpu_cublas_get_local_handle(),
-	CUBLAS_GEMM('n', 'n', nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
+			CUBLAS_OP_N, CUBLAS_OP_N,
-				     (TYPE)0.0, subC, ldC);
+			nxC, nyC, nyA,
+			&p1, subA, ldA, subB, ldB,
+			&v0, subC, ldC);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 }
 #endif
 #endif
 
 

+ 6 - 7
examples/pipeline/pipeline.c

@@ -35,7 +35,7 @@
 #include <common/blas.h>
 #include <common/blas.h>
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-#include <cublas.h>
+#include <starpu_cublas_v2.h>
 #endif
 #endif
 
 
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
@@ -100,10 +100,9 @@ void pipeline_cublas_axpy(void *descr[], void *arg)
 	float *x = (float *) STARPU_VECTOR_GET_PTR(descr[0]);
 	float *x = (float *) STARPU_VECTOR_GET_PTR(descr[0]);
 	float *y = (float *) STARPU_VECTOR_GET_PTR(descr[1]);
 	float *y = (float *) STARPU_VECTOR_GET_PTR(descr[1]);
 	int n = STARPU_VECTOR_GET_NX(descr[0]);
 	int n = STARPU_VECTOR_GET_NX(descr[0]);
+	float alpha = 1.;
 
 
-	starpu_cublas_set_stream();
+	cublasSaxpy(starpu_cublas_get_local_handle(), n, &alpha, x, 1, y, 1);
-	cublasSaxpy(n, 1., x, 1, y, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }
 #endif
 #endif
 
 
@@ -119,6 +118,7 @@ static struct starpu_codelet pipeline_codelet_axpy =
 	.cpu_funcs_name = {"pipeline_cpu_axpy"},
 	.cpu_funcs_name = {"pipeline_cpu_axpy"},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {pipeline_cublas_axpy},
 	.cuda_funcs = {pipeline_cublas_axpy},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #endif
 	.nbuffers = 2,
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_RW},
 	.modes = {STARPU_R, STARPU_RW},
@@ -144,10 +144,8 @@ void pipeline_cublas_sum(void *descr[], void *arg)
 	int n = STARPU_VECTOR_GET_NX(descr[0]);
 	int n = STARPU_VECTOR_GET_NX(descr[0]);
 	float y;
 	float y;
 
 
-	starpu_cublas_set_stream();
+	cublasSasum(starpu_cublas_get_local_handle(), n, x, 1, &y);
-	y = cublasSasum(n, x, 1);
 
 
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	FPRINTF(stderr,"CUBLAS finished with %f\n", y);
 	FPRINTF(stderr,"CUBLAS finished with %f\n", y);
 }
 }
 #endif
 #endif
@@ -164,6 +162,7 @@ static struct starpu_codelet pipeline_codelet_sum =
 	.cpu_funcs_name = {"pipeline_cpu_sum"},
 	.cpu_funcs_name = {"pipeline_cpu_sum"},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {pipeline_cublas_sum},
 	.cuda_funcs = {pipeline_cublas_sum},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #endif
 	.nbuffers = 1,
 	.nbuffers = 1,
 	.modes = {STARPU_R},
 	.modes = {STARPU_R},

+ 13 - 11
examples/reductions/dot_product.c

@@ -29,7 +29,7 @@
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cuda.h>
-#include <cublas.h>
+#include <starpu_cublas_v2.h>
 #endif
 #endif
 
 
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
@@ -245,7 +245,7 @@ void dot_cpu_func(void *descr[], void *cl_arg)
 void dot_cuda_func(void *descr[], void *cl_arg)
 void dot_cuda_func(void *descr[], void *cl_arg)
 {
 {
 	DOT_TYPE current_dot;
 	DOT_TYPE current_dot;
-	DOT_TYPE local_dot;
+	float local_dot;
 
 
 	float *local_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	float *local_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	float *local_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
 	float *local_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
@@ -256,9 +256,10 @@ void dot_cuda_func(void *descr[], void *cl_arg)
 	cudaMemcpyAsync(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 	cudaMemcpyAsync(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 
-	if (cublas_version >= 7050)
+	cublasStatus_t status = cublasSdot(starpu_cublas_get_local_handle(), n, local_x, 1, local_y, 1, &local_dot);
-		starpu_cublas_set_stream();
+	if (status != CUBLAS_STATUS_SUCCESS)
-	local_dot = (DOT_TYPE)cublasSdot(n, local_x, 1, local_y, 1);
+		STARPU_CUBLAS_REPORT_ERROR(status);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 
 	/* FPRINTF(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot); */
 	/* FPRINTF(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot); */
 	current_dot += local_dot;
 	current_dot += local_dot;
@@ -358,14 +359,15 @@ int main(int argc, char **argv)
 #endif
 #endif
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-	/* cublasSdot has synchronization issues when using a non-blocking stream (Nvidia bugid 1669886) */
+	cublasHandle_t handle;
-	cublasGetVersion(&cublas_version);
+	cublasCreate(&handle);
+	cublasGetVersion(handle, &cublas_version);
+	cublasDestroy(handle);
 	if (cublas_version >= 7050)
 	if (cublas_version >= 7050)
 		starpu_cublas_init();
 		starpu_cublas_init();
-	if (starpu_get_env_number_default("STARPU_NWORKER_PER_CUDA", 1) > 1
+	else
-	 && starpu_get_env_number_default("STARPU_CUDA_THREAD_PER_WORKER", 0) == 1)
+		/* Disable the sdot cublas kernel, it is bogus with a
-		/* Disable the sdot cublas kernel, it is bogus with concurrent
+		 * non-blocking stream (Nvidia bugid 1669886) */
-		 * multistream execution (Nvidia bugid 1881192) */
 		dot_codelet.cuda_funcs[0] = NULL;
 		dot_codelet.cuda_funcs[0] = NULL;
 #endif
 #endif
 
 

+ 0 - 4
examples/sched_ctx/gpu_partition.c

@@ -26,10 +26,6 @@
 
 
 #include <common/blas.h>
 #include <common/blas.h>
 
 
-#ifdef STARPU_USE_CUDA
-#include <cublas.h>
-#endif
-
 
 
 #define N	512*512
 #define N	512*512
 #define NITER   100
 #define NITER   100

+ 2 - 0
examples/spmv/dw_block_spmv.c

@@ -167,6 +167,7 @@ struct starpu_codelet cl =
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cublas_block_spmv},
 	.cuda_funcs = {cublas_block_spmv},
 #endif
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 3,
 	.nbuffers = 3,
 	.modes = {STARPU_R, STARPU_R, STARPU_RW}
 	.modes = {STARPU_R, STARPU_R, STARPU_RW}
 };
 };
@@ -320,6 +321,7 @@ int main(STARPU_ATTRIBUTE_UNUSED int argc,
 	if (ret == -ENODEV)
 	if (ret == -ENODEV)
 		return 77;
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	starpu_cublas_init();
 
 
 	sem_init(&sem, 0, 0U);
 	sem_init(&sem, 0, 0U);
 
 

+ 0 - 4
examples/spmv/dw_block_spmv.h

@@ -28,10 +28,6 @@
 
 
 #include <starpu.h>
 #include <starpu.h>
 
 
-#ifdef STARPU_USE_CUDA
-#include <cublas.h>
-#endif
-
 void cpu_block_spmv(void *descr[], void *_args);
 void cpu_block_spmv(void *descr[], void *_args);
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA

+ 10 - 2
examples/spmv/dw_block_spmv_kernels.c

@@ -24,6 +24,12 @@
  *   U22 
  *   U22 
  */
  */
 
 
+#ifdef STARPU_USE_CUDA
+#include <starpu_cublas_v2.h>
+static const float p1 =  1.0;
+static const float m1 = -1.0;
+#endif
+
 static inline void common_block_spmv(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
 static inline void common_block_spmv(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
 {
 {
 	/* printf("22\n"); */
 	/* printf("22\n"); */
@@ -43,8 +49,10 @@ static inline void common_block_spmv(void *descr[], int s, STARPU_ATTRIBUTE_UNUS
 			break;
 			break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
-			starpu_cublas_set_stream();
+			cublasStatus_t status = cublasSgemv (starpu_cublas_get_local_handle(),
-			cublasSgemv ('t', dx, dy, 1.0f, block, ld, in, 1, 1.0f, out, 1);
+					CUBLAS_OP_T, dx, dy, &p1, block, ld, in, 1, &p1, out, 1);
+			if (status != CUBLAS_STATUS_SUCCESS)
+				STARPU_CUBLAS_REPORT_ERROR(status);
 			break;
 			break;
 #endif
 #endif
 		default:
 		default:

+ 34 - 0
include/starpu_cublas_v2.h

@@ -0,0 +1,34 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2012  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_CUBLAS_V2_H__
+#define __STARPU_CUBLAS_V2_H__
+
+#include <cublas_v2.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+cublasHandle_t starpu_cublas_get_local_handle(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __STARPU_CUBLAS_V2_H__ */

+ 0 - 1
src/datawizard/copy_driver.h

@@ -29,7 +29,6 @@
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuda_runtime.h>
-#include <cublas.h>
 #endif
 #endif
 
 
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL

+ 21 - 0
src/drivers/cuda/starpu_cublas.c

@@ -22,8 +22,11 @@
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 #include <cublas.h>
 #include <cublas.h>
+#include <starpu_cublas_v2.h>
 
 
 static int cublas_initialized[STARPU_NMAXWORKERS];
 static int cublas_initialized[STARPU_NMAXWORKERS];
+static cublasHandle_t cublas_handles[STARPU_NMAXWORKERS];
+static cublasHandle_t main_handle;
 static starpu_pthread_mutex_t mutex;
 static starpu_pthread_mutex_t mutex;
 
 
 static unsigned get_idx(void) {
 static unsigned get_idx(void) {
@@ -51,6 +54,9 @@ static void init_cublas_func(void *args STARPU_ATTRIBUTE_UNUSED)
 			STARPU_CUBLAS_REPORT_ERROR(cublasst);
 			STARPU_CUBLAS_REPORT_ERROR(cublasst);
 	}
 	}
 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+
+	cublasCreate(&cublas_handles[starpu_worker_get_id_check()]);
+	cublasSetStream(cublas_handles[starpu_worker_get_id_check()], starpu_cuda_get_local_stream());
 }
 }
 
 
 static void set_cublas_stream_func(void *args STARPU_ATTRIBUTE_UNUSED)
 static void set_cublas_stream_func(void *args STARPU_ATTRIBUTE_UNUSED)
@@ -65,6 +71,8 @@ static void shutdown_cublas_func(void *args STARPU_ATTRIBUTE_UNUSED)
 	if (!--cublas_initialized[idx])
 	if (!--cublas_initialized[idx])
 		cublasShutdown();
 		cublasShutdown();
 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+
+	cublasDestroy(cublas_handles[starpu_worker_get_id_check()]);
 }
 }
 #endif
 #endif
 
 
@@ -73,6 +81,8 @@ void starpu_cublas_init(void)
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	starpu_execute_on_each_worker(init_cublas_func, NULL, STARPU_CUDA);
 	starpu_execute_on_each_worker(init_cublas_func, NULL, STARPU_CUDA);
 	starpu_execute_on_each_worker(set_cublas_stream_func, NULL, STARPU_CUDA);
 	starpu_execute_on_each_worker(set_cublas_stream_func, NULL, STARPU_CUDA);
+
+	cublasCreate(&main_handle);
 #endif
 #endif
 }
 }
 
 
@@ -80,6 +90,8 @@ void starpu_cublas_shutdown(void)
 {
 {
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	starpu_execute_on_each_worker(shutdown_cublas_func, NULL, STARPU_CUDA);
 	starpu_execute_on_each_worker(shutdown_cublas_func, NULL, STARPU_CUDA);
+
+	cublasDestroy(main_handle);
 #endif
 #endif
 }
 }
 
 
@@ -92,3 +104,12 @@ void starpu_cublas_set_stream(void)
 		cublasSetKernelStream(starpu_cuda_get_local_stream());
 		cublasSetKernelStream(starpu_cuda_get_local_stream());
 #endif
 #endif
 }
 }
+
+cublasHandle_t starpu_cublas_get_local_handle(void)
+{
+	int workerid = starpu_worker_get_id();
+	if (workerid >= 0)
+		return cublas_handles[workerid];
+	else
+		return main_handle;
+}

+ 9 - 14
tests/microbenchs/matrix_as_vector.c

@@ -18,7 +18,7 @@
 #include "../helper.h"
 #include "../helper.h"
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-#  include <cublas.h>
+#  include <starpu_cublas_v2.h>
 #endif
 #endif
 
 
 /*
 /*
@@ -55,8 +55,9 @@ void vector_cuda_func(void *descr[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
 	float *matrix = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	float *matrix = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	int nx = STARPU_VECTOR_GET_NX(descr[0]);
 	int nx = STARPU_VECTOR_GET_NX(descr[0]);
 
 
-	starpu_cublas_set_stream();
+	float sum;
-	float sum = cublasSasum(nx, matrix, 1);
+	cublasSasum(starpu_cublas_get_local_handle(), nx, matrix, 1, &sum);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	sum /= nx;
 	sum /= nx;
 
 
 	cudaMemcpyAsync(matrix, &sum, sizeof(matrix[0]), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
 	cudaMemcpyAsync(matrix, &sum, sizeof(matrix[0]), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
@@ -87,7 +88,9 @@ void matrix_cuda_func(void *descr[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
 	int nx = STARPU_MATRIX_GET_NX(descr[0]);
 	int nx = STARPU_MATRIX_GET_NX(descr[0]);
 	int ny = STARPU_MATRIX_GET_NY(descr[0]);
 	int ny = STARPU_MATRIX_GET_NY(descr[0]);
 
 
-	float sum = cublasSasum(nx*ny, matrix, 1);
+	float sum;
+	cublasSasum(starpu_cublas_get_local_handle(), nx*ny, matrix, 1, &sum);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	sum /= nx*ny;
 	sum /= nx*ny;
 
 
 	cudaMemcpyAsync(matrix, &sum, sizeof(matrix[0]), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
 	cudaMemcpyAsync(matrix, &sum, sizeof(matrix[0]), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
@@ -218,12 +221,7 @@ int main(int argc, char **argv)
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-#ifdef STARPU_USE_CUDA
+	starpu_cublas_init();
-	/* cublasSasum has synchronization issues when using a non-blocking stream */
-	cublasGetVersion(&cublas_version);
-	if (cublas_version >= 7050)
-		starpu_cublas_init();
-#endif
 
 
 	devices = starpu_cpu_worker_get_count();
 	devices = starpu_cpu_worker_get_count();
 	if (devices)
 	if (devices)
@@ -246,11 +244,8 @@ int main(int argc, char **argv)
 
 
 error:
 error:
 	if (ret == -ENODEV) ret=STARPU_TEST_SKIPPED;
 	if (ret == -ENODEV) ret=STARPU_TEST_SKIPPED;
-#ifdef STARPU_USE_CUDA
-	if (cublas_version >= 7050)
-		starpu_cublas_shutdown();
-#endif
 
 
+	starpu_cublas_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
 	STARPU_RETURN(ret);
 	STARPU_RETURN(ret);
 }
 }