Browse Source

Instead of using cublasSdot or cublasDdot, we reimplement scalar product in
CUDA not the have our results back in main memory (transfering the result back
takes too long and imposes some synchronous data transfers).

Cédric Augonnet 14 years ago
parent
commit
1618f94c1f
3 changed files with 150 additions and 36 deletions
  1. 7 1
      examples/Makefile.am
  2. 128 0
      examples/cg/cg_dot_kernel.cu
  3. 15 35
      examples/cg/cg_kernels.c

+ 7 - 1
examples/Makefile.am

@@ -61,7 +61,7 @@ CLEANFILES += *.gcno *.gcda *.linkinfo
 
 if STARPU_USE_CUDA
 
-NVCCFLAGS += --compiler-options -fno-strict-aliasing  $(HWLOC_CFLAGS) -I$(top_srcdir)/include/ -I$(top_builddir)/include/ -I$(top_srcdir)/examples/
+NVCCFLAGS += --compiler-options -fno-strict-aliasing  $(HWLOC_CFLAGS) -I$(top_srcdir)/include/ -I$(top_builddir)/include/ -I$(top_srcdir)/examples/  -arch sm_13
 
 .cu.o:
 	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
@@ -458,6 +458,12 @@ cg_cg_SOURCES =					\
 	cg/cg.c					\
 	cg/cg_kernels.c				\
 	common/blas.c
+
+if STARPU_USE_CUDA
+cg_cg_SOURCES +=				\
+	cg/cg_dot_kernel.cu
+endif
+
 endif
 
 

+ 128 - 0
examples/cg/cg_dot_kernel.cu

@@ -0,0 +1,128 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_cuda.h>
+
+#include "cg.h"
+
+#define MAXNBLOCKS	128
+#define MAXTHREADSPERBLOCK	256
+
+static __global__ void dot_device(TYPE *vx, TYPE *vy, unsigned n, TYPE *dot_array)
+{
+	__shared__ TYPE scnt[MAXTHREADSPERBLOCK];
+
+	/* Do we have a successful shot ? */
+	const int tid = threadIdx.x + blockIdx.x*blockDim.x;
+
+	const int nthreads = gridDim.x * blockDim.x;
+
+	/* Blank the shared mem buffer */
+	if (threadIdx.x < MAXTHREADSPERBLOCK)
+		scnt[threadIdx.x] = (TYPE)0.0;
+
+	__syncthreads();
+
+	int ind;
+	for (ind = tid; ind < n; ind += nthreads)
+	{ 
+		TYPE x = vx[ind];
+		TYPE y = vy[ind];
+
+		scnt[threadIdx.x] += (x*y);
+	}
+
+	__syncthreads();
+
+	/* Perform a reduction to compute the sum on each thread within that block */
+
+	/* NB: We assume that the number of threads per block is a power of 2 ! */
+	unsigned s;
+	for (s = blockDim.x/2; s!=0; s>>=1)
+	{
+		if (threadIdx.x < s)
+			scnt[threadIdx.x] += scnt[threadIdx.x + s];
+
+		__syncthreads();
+	}
+
+	/* report the number of successful shots in the block */
+	if (threadIdx.x == 0)
+		dot_array[blockIdx.x] = scnt[0];
+
+	__syncthreads();
+}
+
+static __global__ void gather_dot_device(TYPE *dot_array, TYPE *dot)
+{
+	__shared__ TYPE accumulator[MAXNBLOCKS];
+
+	unsigned i;
+
+	/* Load the values from global mem */
+	for (i = 0; i < blockDim.x; i++)
+		accumulator[i] = dot_array[i];
+
+	__syncthreads();
+
+	/* Perform a reduction in shared memory */
+	unsigned s;
+	for (s = blockDim.x/2; s!=0; s>>=1)
+	{
+		if (threadIdx.x < s)
+			accumulator[threadIdx.x] += accumulator[threadIdx.x + s];
+
+		__syncthreads();
+	}
+
+
+	/* Save the result in global memory */
+	if (threadIdx.x == 0)
+		*dot = *dot + accumulator[0];
+}
+
+extern "C" void dot_host(TYPE *x, TYPE *y, unsigned nelems, TYPE *dot)
+{
+	/* How many blocks do we use ? */ 
+	unsigned nblocks = 128; // TODO
+	STARPU_ASSERT(nblocks <= MAXNBLOCKS);
+	
+	TYPE *per_block_sum;
+	cudaMalloc((void **)&per_block_sum, nblocks*sizeof(TYPE));
+
+	STARPU_ASSERT((nelems % nblocks) == 0);
+
+	/* How many threads per block ? At most 256, but no more threads than
+	 * there are entries to process per block. */
+	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (nelems / nblocks));
+
+	/* each entry of per_block_sum contains the number of successful shots
+	 * in the corresponding block. */
+	dot_device<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(x, y, nelems, per_block_sum);
+
+	/* Note that we do not synchronize between kernel calls because there
+	 * is an implicit serialization */
+	gather_dot_device<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_sum, dot);
+
+	cudaError_t cures;
+	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
+	if (cures)
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	cudaFree(per_block_sum);
+}

+ 15 - 35
examples/cg/cg_kernels.c

@@ -55,9 +55,7 @@ static void accumulate_variable_cuda(void *descr[], void *cl_arg)
 	TYPE *v_src = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
  
 	cublasaxpy(1, (TYPE)1.0, v_src, 1, v_dst, 1);
-	cudaError_t ret = cudaThreadSynchronize();
-	if (ret)
-		STARPU_CUDA_REPORT_ERROR(ret);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -92,9 +90,7 @@ static void accumulate_vector_cuda(void *descr[], void *cl_arg)
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  
 	cublasaxpy(n, (TYPE)1.0, v_src, 1, v_dst, 1);
-	cudaError_t ret = cudaThreadSynchronize();
-	if (ret)
-		STARPU_CUDA_REPORT_ERROR(ret);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -132,8 +128,7 @@ static void bzero_variable_cuda(void *descr[], void *cl_arg)
 	TYPE *v = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
  
 	cublasscal (1, (TYPE)0.0, v, 1);
-	cudaThreadSynchronize();
-
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -165,9 +160,7 @@ static void bzero_vector_cuda(void *descr[], void *cl_arg)
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  
 	cublasscal (n, (TYPE)0.0, v, 1);
-	cudaError_t ret = cudaThreadSynchronize();
-	if (ret)
-		STARPU_CUDA_REPORT_ERROR(ret);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -199,33 +192,20 @@ starpu_codelet bzero_vector_cl = {
  */
 
 #ifdef STARPU_USE_CUDA
+extern void dot_host(TYPE *x, TYPE *y, unsigned nelems, TYPE *dot);
+
 static void dot_kernel_cuda(void *descr[], void *cl_arg)
 {
-	cudaError_t ret;
-
 	TYPE *dot = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]); 
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[2]);
 
 	unsigned n = STARPU_VECTOR_GET_NX(descr[1]);
- 
-	/* Get current value */
-	TYPE host_dot;
-	cudaMemcpy(&host_dot, dot, sizeof(TYPE), cudaMemcpyDeviceToHost);
-	ret = cudaThreadSynchronize();
-	if (ret)
-		STARPU_CUDA_REPORT_ERROR(ret);
-
-	TYPE local_dot = cublasdot(n, v1, 1, v2, 1);
-	host_dot += local_dot;
-	ret = cudaThreadSynchronize();
-	if (ret)
-		STARPU_CUDA_REPORT_ERROR(ret);
-
-	cudaMemcpy(dot, &host_dot, sizeof(TYPE), cudaMemcpyHostToDevice);
-	ret = cudaThreadSynchronize();
-	if (ret)
-		STARPU_CUDA_REPORT_ERROR(ret);
+
+	/* Contrary to cublasSdot, this function puts its result directly in
+	 * device memory, so that we don't have to transfer that value back and
+	 * forth. */
+	dot_host(v1, v2, n, dot);
 }
 #endif
 
@@ -296,7 +276,7 @@ static void scal_kernel_cuda(void *descr[], void *cl_arg)
 	/* v1 = p1 v1 */
 	TYPE alpha = p1;
 	cublasscal(n, alpha, v1, 1);
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -347,7 +327,7 @@ static void gemv_kernel_cuda(void *descr[], void *cl_arg)
 
 	/* Compute v1 = alpha M v2 + beta v1 */
 	cublasgemv('N', nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -453,7 +433,7 @@ static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
 	 */
 	cublasscal(n, p1, v1, 1);
 	cublasaxpy(n, p2, v2, 1, v1, 1);
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -524,7 +504,7 @@ static void axpy_kernel_cuda(void *descr[], void *cl_arg)
 	/* Compute v1 = v1 + p1 * v2.
 	 */
 	cublasaxpy(n, p1, v2, 1, v1, 1);
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif