Przeglądaj źródła

Bug fix: use a kernel to actually put zeros in the vector that is initialized
before a reduction. We need a kernel to do that instead of cublasSscal because
nan * 0 != 0.

Cédric Augonnet 14 lat temu
rodzic
commit
48aea5e024
3 zmienionych plików z 25 dodań i 6 usunięć
  1. 0 4
      examples/cg/cg.c
  2. 19 0
      examples/cg/cg_dot_kernel.cu
  3. 6 2
      examples/cg/cg_kernels.c

+ 0 - 4
examples/cg/cg.c

@@ -92,8 +92,6 @@ extern starpu_codelet bzero_vector_cl;
 
 static void generate_random_problem(void)
 {
-	srand48(0xdeadbeef);
-
 	int i, j;
 
 	starpu_data_malloc_pinned_if_possible((void **)&A, n*n*sizeof(TYPE));
@@ -101,14 +99,12 @@ static void generate_random_problem(void)
 	starpu_data_malloc_pinned_if_possible((void **)&x, n*sizeof(TYPE));
 	assert(A && b && x);
 
-	/* Create a random matrix (A) and two random vectors (x and b) */
 	for (j = 0; j < n; j++)
 	{
 		b[j] = (TYPE)1.0;
 		x[j] = (TYPE)0.0;
 
 		/* We take Hilbert matrix that is not well conditionned but definite positive: H(i,j) = 1/(1+i+j) */
-
 		for (i = 0; i < n; i++)
 		{
 			A[n*j + i] = (TYPE)(1.0/(1.0+i+j));

+ 19 - 0
examples/cg/cg_dot_kernel.cu

@@ -126,3 +126,22 @@ extern "C" void dot_host(TYPE *x, TYPE *y, unsigned nelems, TYPE *dot)
 
 	cudaFree(per_block_sum);
 }
+
+static __global__ void zero_vector_device(TYPE *x, unsigned nelems, unsigned nelems_per_thread)
+{
+	unsigned i;
+	unsigned first_i = blockDim.x * blockIdx.x + threadIdx.x;
+
+	for (i = first_i; i < nelems; i += nelems_per_thread)
+		x[i] = 0.0;
+}
+
+extern "C" void zero_vector(TYPE *x, unsigned nelems)
+{
+	unsigned nblocks = STARPU_MIN(128, nelems);
+	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (nelems / nblocks));
+
+	unsigned nelems_per_thread = nelems / (nblocks * nthread_per_block);
+
+	zero_vector_device<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(x, nelems, nelems_per_thread);
+}

+ 6 - 2
examples/cg/cg_kernels.c

@@ -123,11 +123,14 @@ starpu_codelet accumulate_vector_cl = {
  */
 
 #ifdef STARPU_USE_CUDA
+extern void zero_vector(TYPE *x, unsigned nelems);
+
 static void bzero_variable_cuda(void *descr[], void *cl_arg)
 {
 	TYPE *v = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
+
+	zero_vector(v, 1);
  
-	cublasscal (1, (TYPE)0.0, v, 1);
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
@@ -159,7 +162,8 @@ static void bzero_vector_cuda(void *descr[], void *cl_arg)
 	TYPE *v = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  
-	cublasscal (n, (TYPE)0.0, v, 1);
+	zero_vector(v, n);
+
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif