Sfoglia il codice sorgente

Provide good examples by always using streams for cublas

Samuel Thibault 13 anni fa
parent
commit
29b7f32f77

+ 2 - 1
examples/axpy/axpy.c

@@ -27,6 +27,7 @@
 
 #ifdef STARPU_USE_CUDA
 #include <cublas.h>
+#include <starpu_cuda.h>
 #endif
 #ifdef STARPU_USE_OPENCL
 #include <starpu_opencl.h>
@@ -74,7 +75,7 @@ void axpy_gpu(void *descr[], __attribute__((unused)) void *arg)
 	TYPE *block_y = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 
 	CUBLASAXPY((int)n, alpha, block_x, 1, block_y, 1);
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 

+ 2 - 2
examples/cholesky/cholesky_kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2011-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -196,7 +196,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 				fprintf(stderr, "Error in Magma: %d\n", ret);
 				STARPU_ABORT();
 			}
-			cudaError_t cures = cudaThreadSynchronize();
+			cudaError_t cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 			STARPU_ASSERT(!cures);
 			}
 #else

+ 4 - 4
examples/heat/dw_factolu_kernels.c

@@ -137,7 +137,7 @@ static inline void dw_common_cpu_codelet_update_u22(void *descr[], int s, __attr
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_ABORT();
 
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 			break;
 #endif
@@ -202,7 +202,7 @@ static inline void dw_common_codelet_update_u12(void *descr[], int s, __attribut
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_ABORT();
 
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 			break;
 #endif
@@ -264,7 +264,7 @@ static inline void dw_common_codelet_update_u21(void *descr[], int s, __attribut
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_ABORT();
 
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 			break;
 #endif
@@ -357,7 +357,7 @@ static inline void dw_common_codelet_update_u11(void *descr[], int s, __attribut
 								&sub11[(z+1) + (z+1)*ld],ld);
 			}
 
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 			break;
 #endif

+ 1 - 0
examples/heat/heat.c

@@ -788,6 +788,7 @@ int main(int argc, char **argv)
 		if (check)
 			solve_system(DIM, newsize, result, RefArray, Bformer, A, B);
 
+		starpu_helper_cublas_shutdown();
 		starpu_shutdown();
 		free_system(A, B, newsize, pinned);
 	}

+ 6 - 6
examples/lu/xlu_kernels.c

@@ -70,7 +70,7 @@ static inline void STARPU_LU(common_u22)(void *descr[],
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
 				STARPU_ABORT();
 
-			if (STARPU_UNLIKELY((cures = cudaThreadSynchronize()) != cudaSuccess))
+			if (STARPU_UNLIKELY((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess))
 				STARPU_CUDA_REPORT_ERROR(cures);
 
 			break;
@@ -158,7 +158,7 @@ static inline void STARPU_LU(common_u12)(void *descr[],
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
 				STARPU_ABORT();
 
-			if (STARPU_UNLIKELY((cures = cudaThreadSynchronize()) != cudaSuccess))
+			if (STARPU_UNLIKELY((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess))
 				STARPU_CUDA_REPORT_ERROR(cures);
 
 			break;
@@ -243,7 +243,7 @@ static inline void STARPU_LU(common_u21)(void *descr[],
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_ABORT();
 
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 			break;
 #endif
@@ -342,7 +342,7 @@ static inline void STARPU_LU(common_u11)(void *descr[],
 						(CUBLAS_TYPE*)&sub11[(z+1) + (z+1)*ld],ld);
 			}
 			
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 			break;
 #endif
@@ -482,7 +482,7 @@ static inline void STARPU_LU(common_u11_pivot)(void *descr[],
 				
 			}
 
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 			break;
 #endif
@@ -570,7 +570,7 @@ static inline void STARPU_LU(common_pivot)(void *descr[],
 				}
 			}
 
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 			break;
 #endif

+ 4 - 4
gcc-plugin/examples/cholesky/cholesky_kernels.c

@@ -53,7 +53,7 @@ static inline void chol_common_cpu_codelet_update_u22(const float *left, const f
 			st = cublasGetError();
 			STARPU_ASSERT(!st);
 
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 			break;
 #endif
@@ -99,7 +99,7 @@ static inline void chol_common_codelet_update_u21(const float *sub11, float *sub
 #ifdef STARPU_USE_CUDA
 		case 1:
 			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 			break;
 #endif
 		default:
@@ -170,7 +170,7 @@ static inline void chol_common_codelet_update_u11(float *sub11, unsigned nx, uns
 					fprintf(stderr, "Error in Magma: %d\n", ret);
 					STARPU_ABORT();
 				}
-				cudaError_t cures = cudaThreadSynchronize();
+				cudaError_t cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 				STARPU_ASSERT(!cures);
 			}
 #else
@@ -193,7 +193,7 @@ static inline void chol_common_codelet_update_u11(float *sub11, unsigned nx, uns
 							&sub11[(z+1)+(z+1)*ld], ld);
 			}
 
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 #endif
 			break;
 #endif

+ 5 - 4
mpi/examples/cholesky/mpi_cholesky_kernels.c

@@ -22,6 +22,7 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cublas.h>
+#include <starpu_cuda.h>
 #ifdef STARPU_HAVE_MAGMA
 #include "magma.h"
 #include "magma_lapack.h"
@@ -65,7 +66,7 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __at
 			st = cublasGetError();
 			STARPU_ASSERT(!st);
 
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 			break;
 #endif
@@ -114,7 +115,7 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, __attrib
 #ifdef STARPU_USE_CUDA
 		case 1:
 			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 			break;
 #endif
 		default:
@@ -188,7 +189,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 					fprintf(stderr, "Error in Magma: %d\n", ret);
 					STARPU_ABORT();
 				}
-				cudaError_t cures = cudaThreadSynchronize();
+				cudaError_t cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 				STARPU_ASSERT(!cures);
 			}
 #else
@@ -211,7 +212,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 							&sub11[(z+1)+(z+1)*ld], ld);
 			}
 
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 #endif
 			break;
 #endif

+ 4 - 4
mpi/examples/mpi_lu/pxlu_kernels.c

@@ -70,7 +70,7 @@ static inline void STARPU_PLU(common_u22)(void *descr[],
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
 				STARPU_ABORT();
 
-			if (STARPU_UNLIKELY((cures = cudaThreadSynchronize()) != cudaSuccess))
+			if (STARPU_UNLIKELY((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess))
 				STARPU_CUDA_REPORT_ERROR(cures);
 
 			break;
@@ -175,7 +175,7 @@ static inline void STARPU_PLU(common_u12)(void *descr[],
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
 				STARPU_ABORT();
 
-			if (STARPU_UNLIKELY((cures = cudaThreadSynchronize()) != cudaSuccess))
+			if (STARPU_UNLIKELY((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess))
 				STARPU_CUDA_REPORT_ERROR(cures);
 
 			break;
@@ -282,7 +282,7 @@ static inline void STARPU_PLU(common_u21)(void *descr[],
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_ABORT();
 
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 			break;
 #endif
@@ -394,7 +394,7 @@ static inline void STARPU_PLU(common_u11)(void *descr[],
 						&sub11[(z+1) + (z+1)*ld],ld);
 			}
 			
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 			break;
 #endif

+ 4 - 1
mpi/examples/mpi_lu/pxlu_kernels.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -19,6 +19,9 @@
 #define __PXLU_KERNELS_H__
 
 #include <starpu.h>
+#ifdef STARPU_USE_CUDA
+#include <starpu_cuda.h>
+#endif
 
 #define str(s) #s
 #define xstr(s)        str(s)

+ 1 - 1
tests/experiments/latency/cuda_latency.c

@@ -96,7 +96,7 @@ void recv_data(unsigned src, unsigned dst)
 #ifdef ASYNC
 	cures = cudaMemcpyAsync(gpu_buffer[dst], cpu_buffer, buffer_size, cudaMemcpyHostToDevice, stream[dst]);
 	STARPU_ASSERT(!cures);
-	cures = cudaThreadSynchronize();
+	cures = cudaStreamSynchronize(stream[dst]);
 	STARPU_ASSERT(!cures);
 #else
 	cures = cudaMemcpy(gpu_buffer[dst], cpu_buffer, buffer_size, cudaMemcpyHostToDevice);

+ 2 - 2
tests/microbenchs/matrix_as_vector.c

@@ -47,7 +47,6 @@ void vector_cuda_func(void *descr[], void *cl_arg __attribute__((unused)))
 	int nx = STARPU_VECTOR_GET_NX(descr[0]);
 
 	float sum = cublasSasum(nx, matrix, 1);
-	cudaThreadSynchronize();
 	sum /= nx;
 
 	cudaMemcpyAsync(matrix, &sum, sizeof(matrix[0]), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
@@ -79,7 +78,6 @@ void matrix_cuda_func(void *descr[], void *cl_arg __attribute__((unused)))
 	int ny = STARPU_MATRIX_GET_NY(descr[0]);
 
 	float sum = cublasSasum(nx*ny, matrix, 1);
-	cudaThreadSynchronize();
 	sum /= nx*ny;
 
 	cudaMemcpyAsync(matrix, &sum, sizeof(matrix[0]), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
@@ -196,6 +194,7 @@ int main(int argc, char **argv)
 	ret = starpu_init(NULL);
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	starpu_helper_cublas_init();
 
 	devices = starpu_cpu_worker_get_count();
 	if (devices)
@@ -220,6 +219,7 @@ int main(int argc, char **argv)
 
 error:
 	if (ret == -ENODEV) ret=STARPU_TEST_SKIPPED;
+	starpu_helper_cublas_shutdown();
 	starpu_shutdown();
 	STARPU_RETURN(ret);
 }