소스 검색

Implement a (really naive) CUDA kernel that computes pi with a Monte Carlo
method.

Cédric Augonnet 15 년 전
부모
커밋
76f2fd6053
4개의 변경된 파일133개의 추가작업 그리고 10개의 파일을 삭제
  1. 6 0
      examples/Makefile.am
  2. 6 10
      examples/pi/pi.c
  3. 32 0
      examples/pi/pi.h
  4. 89 0
      examples/pi/pi_kernel.cu

+ 6 - 0
examples/Makefile.am

@@ -165,6 +165,12 @@ examplebin_PROGRAMS +=				\
 pi_pi_SOURCES =					\
 	pi/pi.c
 
+if STARPU_USE_CUDA
+pi_pi_SOURCES +=				\
+	pi/pi_kernel.cu
+endif
+
+
 ################
 # AXPY example #
 ################

+ 6 - 10
examples/pi/pi.c

@@ -14,15 +14,11 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include <starpu.h>
-#include <stdio.h>
+#include "pi.h"
 
-#define NTASKS	(32*1024)
-#define NSHOT_PER_TASK	1024
-
-#define SIZE	(NTASKS*NSHOT_PER_TASK)
-
-#define TYPE	float
+#ifdef STARPU_USE_CUDA
+void cuda_kernel(void **descr, void *cl_arg);
+#endif
 
 static void cpu_kernel(void *descr[], void *cl_arg)
 {
@@ -95,9 +91,9 @@ int main(int argc, char **argv)
 	starpu_partition_data(cnt_array_handle, &f);
 
 	struct starpu_codelet_t cl = {
-		.where = STARPU_CPU,
+		.where = STARPU_CPU|STARPU_CUDA,
 		.cpu_func = cpu_kernel,
-		.cuda_func = NULL, /* TODO */
+		.cuda_func = cuda_kernel,
 		.nbuffers = 3,
 		.model = NULL /* TODO */
 	};

+ 32 - 0
examples/pi/pi.h

@@ -0,0 +1,32 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __PI_H__
+#define __PI_H__
+
+#include <starpu.h>
+#include <stdio.h>
+
+#define NTASKS	(32*1024)
+#define NSHOT_PER_TASK	1024
+
+#define SIZE	(NTASKS*NSHOT_PER_TASK)
+
+#define TYPE	float
+
+//extern "C" void cuda_kernel(void *descr[], void *cl_arg);
+
+#endif // __PI_H__

+ 89 - 0
examples/pi/pi_kernel.cu

@@ -0,0 +1,89 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "pi.h"
+
+#define BLOCK_SIZE	256
+
+static __global__ void monte_carlo(TYPE *random_numbers_x, TYPE *random_numbers_y,
+						unsigned n, unsigned *output_cnt)
+{
+	__shared__ unsigned scnt[BLOCK_SIZE];
+
+	const int tid = threadIdx.x + blockIdx.x*blockDim.x;
+
+	/* Blank the shared mem buffer */
+	if (threadIdx.x < 32)
+		scnt[threadIdx.x] = 0;
+
+	__syncthreads();
+
+	/* Do we have a successful shot ? */
+	TYPE x = random_numbers_x[tid];
+	TYPE y = random_numbers_y[tid];
+	TYPE dist = (x*x + y*y);
+
+	scnt[threadIdx.x] = (dist <= 1.0f);
+
+	__syncthreads();
+
+	/* XXX that's totally unoptimized : we should do a reduction ! */
+	if (threadIdx.x == 0)
+	{
+		unsigned total_cnt = 0;
+		unsigned i;
+		for (i = 0; i < BLOCK_SIZE; i++)
+			total_cnt += scnt[i];
+
+		output_cnt[blockIdx.x] = total_cnt;
+	}
+}
+
+static __global__ void sum_per_block_cnt(unsigned previous_nblocks,
+						unsigned *output_cnt,
+						unsigned *cnt)
+{
+	/* XXX that's totally unoptimized yet : we should do a reduction ! */
+	if (threadIdx.x == 0)
+	{
+		unsigned total_cnt = 0;
+		unsigned i;
+		for (i = 0; i < previous_nblocks; i++)
+			total_cnt += output_cnt[i];
+
+		*cnt = total_cnt;
+	}
+}
+
+extern "C" void cuda_kernel(void *descr[], void *cl_arg)
+{
+	TYPE *random_numbers_x = (TYPE *)STARPU_GET_VECTOR_PTR(descr[0]);
+	TYPE *random_numbers_y = (TYPE *)STARPU_GET_VECTOR_PTR(descr[1]);
+	unsigned nx = STARPU_GET_VECTOR_NX(descr[0]);
+
+	unsigned *cnt = (unsigned *)STARPU_GET_VECTOR_PTR(descr[2]);
+	
+	unsigned *per_block_cnt;
+	cudaMalloc((void **)&per_block_cnt, (nx/BLOCK_SIZE)*sizeof(unsigned));
+
+	monte_carlo<<<nx/BLOCK_SIZE, BLOCK_SIZE>>>(random_numbers_x, random_numbers_y, nx, per_block_cnt);
+	cudaThreadSynchronize();
+
+	sum_per_block_cnt<<<1, 32>>>(nx/BLOCK_SIZE, per_block_cnt, cnt);
+	cudaThreadSynchronize();
+
+	cudaFree(per_block_cnt);
+}